Zip Ye commited on
Commit
fa1aa1c
·
1 Parent(s): 399c281

initial commit

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. README.md +8 -11
  2. draft_probe_suite/draft_model/config.json +38 -0
  3. draft_probe_suite/draft_model/model.safetensors +3 -0
  4. draft_probe_suite/pretrained_draft_model/config.json +38 -0
  5. draft_probe_suite/pretrained_draft_model/model.safetensors +3 -0
  6. draft_probe_suite/probe/config.json +1 -0
  7. draft_probe_suite/probe/state_dict.pth +3 -0
  8. sglang/README.md +17 -0
  9. sglang/__init__.py +83 -0
  10. sglang/bench_offline_throughput.py +476 -0
  11. sglang/bench_one_batch.py +795 -0
  12. sglang/bench_one_batch_server.py +605 -0
  13. sglang/bench_serving.py +0 -0
  14. sglang/check_env.py +433 -0
  15. sglang/cli/__init__.py +0 -0
  16. sglang/cli/generate.py +33 -0
  17. sglang/cli/main.py +26 -0
  18. sglang/cli/serve.py +75 -0
  19. sglang/cli/utils.py +152 -0
  20. sglang/compile_deep_gemm.py +191 -0
  21. sglang/eval/llama3_eval.py +315 -0
  22. sglang/eval/loogle_eval.py +164 -0
  23. sglang/global_config.py +29 -0
  24. sglang/jit_kernel/.clang-format +19 -0
  25. sglang/jit_kernel/__pycache__/hicache.cpython-311.pyc +0 -0
  26. sglang/jit_kernel/__pycache__/utils.cpython-311.pyc +0 -0
  27. sglang/jit_kernel/csrc/cuda_wait_value.cuh +38 -0
  28. sglang/jit_kernel/csrc/hicache.cuh +264 -0
  29. sglang/jit_kernel/cuda_wait_value.py +79 -0
  30. sglang/jit_kernel/hicache.py +138 -0
  31. sglang/jit_kernel/include/sgl_kernel/tensor.h +487 -0
  32. sglang/jit_kernel/include/sgl_kernel/utils.cuh +101 -0
  33. sglang/jit_kernel/include/sgl_kernel/utils.h +88 -0
  34. sglang/jit_kernel/include/sgl_kernel/warp.cuh +145 -0
  35. sglang/jit_kernel/utils.py +103 -0
  36. sglang/lang/__pycache__/api.cpython-311.pyc +0 -0
  37. sglang/lang/__pycache__/chat_template.cpython-311.pyc +0 -0
  38. sglang/lang/__pycache__/choices.cpython-311.pyc +0 -0
  39. sglang/lang/__pycache__/interpreter.cpython-311.pyc +0 -0
  40. sglang/lang/__pycache__/ir.cpython-311.pyc +0 -0
  41. sglang/lang/api.py +292 -0
  42. sglang/lang/backend/__pycache__/base_backend.cpython-311.pyc +0 -0
  43. sglang/lang/backend/__pycache__/runtime_endpoint.cpython-311.pyc +0 -0
  44. sglang/lang/backend/anthropic.py +73 -0
  45. sglang/lang/backend/base_backend.py +82 -0
  46. sglang/lang/backend/litellm.py +90 -0
  47. sglang/lang/backend/openai.py +475 -0
  48. sglang/lang/backend/runtime_endpoint.py +527 -0
  49. sglang/lang/backend/vertexai.py +148 -0
  50. sglang/lang/chat_template.py +668 -0
README.md CHANGED
@@ -1,11 +1,10 @@
1
  <div align="center">
2
- <img src="assets/logo.png" alt="SEAGLE Logo" width="200"/>
3
  </div>
4
 
5
- # SEAGLE: Safety-Aware EAGLE
6
 
7
  **SEAGLE** is a safety-aware speculative decoding policy based on [SGLang](https://github.com/sgl-project/sglang). It embeds a lightweight probe model into the draft loop of [EAGLE-3](https://github.com/SafeAILab/EAGLE) speculative decoding, performs real-time safety monitoring on each decoding step, dynamically adjusts draft tokens, and triggers a fallback mechanism when unsafe content is continuously detected.
8
- > [HERE](https://www.modelscope.cn/models/Alibaba-AAIG/SEAGLE) is the link to the source code and model weights.
9
 
10
  ![Leaderboard](assets/leaderboard.jpg)
11
 
@@ -76,7 +75,7 @@ from sglang.srt.server_args import ServerArgs
76
  from sglang.srt.entrypoints.http_server import launch_server as _launch_server
77
 
78
  # =========================================================
79
- # Launch SGlang Server with Safety-Aware Eagle3 Decoding
80
  # =========================================================
81
  MODEL_PATH = "your_qwen3_235b_a22b_instruct_2507_path"
82
  DRAFT_MODEL_PATH = "draft_probe_suite/draft_model"
@@ -246,7 +245,7 @@ We begin by evaluating the acceleration performance of our draft models, encompa
246
 
247
  > **Note:** Our pre-trained draft model can be found [here](https://www.modelscope.cn/models/Alibaba-AAIG/SEAGLE/tree/master/draft_probe_suite/pretrained_draft_model). Compared to the [Meituan](https://modelscope.cn/models/lmsys/SGLang-EAGLE3-Qwen3-235B-A22B-Instruct-2507-SpecForge-Meituan) version, our Eagle Head has undergone accelerated training specifically for Chinese. The pre-trained version can be used standalone as an Eagle Head for Qwen3-235B-A22B-Instruct-2507, delivering outstanding acceleration performance in both Chinese and English.
248
 
249
- **Launch with Standard SGLang CLI:**
250
 
251
  ```bash
252
  export SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN=1
@@ -282,11 +281,9 @@ Evaluate the probe's impact on normal chatting data (query safety & response saf
282
  | :--- | :---: | :---: | :---: |
283
  | FuseChat-Mixture | 50,000 | 0.99506 | 0.00494 |
284
 
285
- > **Note:** Even if the probe occasionally produces false positives, the safety-aware speculative decoding mechanism still ensures that the generated responses are meaningful and valuable.
286
 
287
- ### ⚖️ 3.3 End-to-End Utility and Safety
288
-
289
- The trained probe is integrated into the Eagle3 decoding pipeline. We evaluate the end-to-end utility and safety of the SafeAware decoding strategy using an SGLang single-request configuration.
290
 
291
  #### (1) Utility Performance
292
 
@@ -315,8 +312,8 @@ Safety scores are evaluated based on the discriminative reward model (DRM), gene
315
  | 📎 [Chinese: 100 High-Risk](assets/valuesTest_zh_hard_100.jsonl) | DRM Score | 0.43 | 0.49 | **0.83** |
316
  | | QwQ Score | 0.70 | 0.70 | **0.92** |
317
  | | GRM Score | 0.23 | 0.31 | **0.81** |
318
- | 📊 [English Log](assets/GRM_judge_log_en.xlsx) | Logs | ✅ | - | ✅ |
319
- | 📊 [Chinese Log](assets/GRM_judge_log_zh.xlsx) | Logs | ✅ | - | ✅ |
320
 
321
  ---
322
 
 
1
  <div align="center">
2
+ <img src="assets/logo.png" alt="SEAGLE Logo" width="250"/>
3
  </div>
4
 
5
+ # SEAGLE: Safety-Aware EAGLE
6
 
7
  **SEAGLE** is a safety-aware speculative decoding policy based on [SGLang](https://github.com/sgl-project/sglang). It embeds a lightweight probe model into the draft loop of [EAGLE-3](https://github.com/SafeAILab/EAGLE) speculative decoding, performs real-time safety monitoring on each decoding step, dynamically adjusts draft tokens, and triggers a fallback mechanism when unsafe content is continuously detected.
 
8
 
9
  ![Leaderboard](assets/leaderboard.jpg)
10
 
 
75
  from sglang.srt.entrypoints.http_server import launch_server as _launch_server
76
 
77
  # =========================================================
78
+ # Launch SGLang Server with Safety-Aware Eagle3 Decoding
79
  # =========================================================
80
  MODEL_PATH = "your_qwen3_235b_a22b_instruct_2507_path"
81
  DRAFT_MODEL_PATH = "draft_probe_suite/draft_model"
 
245
 
246
  > **Note:** Our pre-trained draft model can be found [here](https://www.modelscope.cn/models/Alibaba-AAIG/SEAGLE/tree/master/draft_probe_suite/pretrained_draft_model). Compared to the [Meituan](https://modelscope.cn/models/lmsys/SGLang-EAGLE3-Qwen3-235B-A22B-Instruct-2507-SpecForge-Meituan) version, our Eagle Head has undergone accelerated training specifically for Chinese. The pre-trained version can be used standalone as an Eagle Head for Qwen3-235B-A22B-Instruct-2507, delivering outstanding acceleration performance in both Chinese and English.
247
 
248
+ **Launch with Standard SGLang Command:**
249
 
250
  ```bash
251
  export SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN=1
 
281
  | :--- | :---: | :---: | :---: |
282
  | FuseChat-Mixture | 50,000 | 0.99506 | 0.00494 |
283
 
284
+ ### ⚖️ 3.3 Utility and Safety
285
 
286
+ The trained probe is integrated into the Eagle3 decoding pipeline. We evaluate the end-to-end utility and safety of the safety-aware decoding strategy using an SGLang single-request configuration.
 
 
287
 
288
  #### (1) Utility Performance
289
 
 
312
  | 📎 [Chinese: 100 High-Risk](assets/valuesTest_zh_hard_100.jsonl) | DRM Score | 0.43 | 0.49 | **0.83** |
313
  | | QwQ Score | 0.70 | 0.70 | **0.92** |
314
  | | GRM Score | 0.23 | 0.31 | **0.81** |
315
+ | 📊 [English Log](assets/GRM_judge_log_en.xlsx) | Evaluation | ✅ | - | ✅ |
316
+ | 📊 [Chinese Log](assets/GRM_judge_log_zh.xlsx) | Evaluation | ✅ | - | ✅ |
317
 
318
  ---
319
 
draft_probe_suite/draft_model/config.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "LlamaForCausalLMEagle3"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 151643,
8
+ "draft_vocab_size": 32000,
9
+ "dtype": "bfloat16",
10
+ "eagle_config": {
11
+ "eagle_aux_hidden_state_layer_ids": [
12
+ 1,
13
+ 46,
14
+ 90
15
+ ],
16
+ "use_aux_hidden_state": true
17
+ },
18
+ "eos_token_id": 151645,
19
+ "head_dim": 128,
20
+ "hidden_act": "silu",
21
+ "hidden_size": 4096,
22
+ "initializer_range": 0.02,
23
+ "intermediate_size": 24576,
24
+ "max_position_embeddings": 40960,
25
+ "mlp_bias": false,
26
+ "model_type": "llama",
27
+ "num_attention_heads": 64,
28
+ "num_hidden_layers": 1,
29
+ "num_key_value_heads": 4,
30
+ "pretraining_tp": 1,
31
+ "rms_norm_eps": 1e-06,
32
+ "rope_scaling": null,
33
+ "rope_theta": 1000000.0,
34
+ "tie_word_embeddings": false,
35
+ "transformers_version": "4.57.1",
36
+ "use_cache": true,
37
+ "vocab_size": 151936
38
+ }
draft_probe_suite/draft_model/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:855968a019ea7ce5d33cdc503737c4fe19fd3aa8a176440d818506bf018f130d
3
+ size 1185333104
draft_probe_suite/pretrained_draft_model/config.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "LlamaForCausalLMEagle3"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 151643,
8
+ "draft_vocab_size": 32000,
9
+ "dtype": "bfloat16",
10
+ "eagle_config": {
11
+ "eagle_aux_hidden_state_layer_ids": [
12
+ 1,
13
+ 46,
14
+ 90
15
+ ],
16
+ "use_aux_hidden_state": true
17
+ },
18
+ "eos_token_id": 151645,
19
+ "head_dim": 128,
20
+ "hidden_act": "silu",
21
+ "hidden_size": 4096,
22
+ "initializer_range": 0.02,
23
+ "intermediate_size": 24576,
24
+ "max_position_embeddings": 40960,
25
+ "mlp_bias": false,
26
+ "model_type": "llama",
27
+ "num_attention_heads": 64,
28
+ "num_hidden_layers": 1,
29
+ "num_key_value_heads": 4,
30
+ "pretraining_tp": 1,
31
+ "rms_norm_eps": 1e-06,
32
+ "rope_scaling": null,
33
+ "rope_theta": 1000000.0,
34
+ "tie_word_embeddings": false,
35
+ "transformers_version": "4.57.1",
36
+ "use_cache": true,
37
+ "vocab_size": 151936
38
+ }
draft_probe_suite/pretrained_draft_model/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b4b0c81dfb283d27ab78fbdffffe1b0962d6fc3ef4bce16dbc4d6561ef19a9b1
3
+ size 1185333104
draft_probe_suite/probe/config.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"input_size": 4096, "output_size": 1, "intermediate_size": 1024}
draft_probe_suite/probe/state_dict.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bee888de61324d261765237d29a23f7d4858fb6c571bbe58726321a8be0b8d26
3
+ size 16803511
sglang/README.md ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Code Structure
2
+
3
+ - `eval`: The evaluation utilities.
4
+ - `lang`: The frontend language.
5
+ - `srt`: The backend engine for running local models. (SRT = SGLang Runtime).
6
+ - `test`: The test utilities.
7
+ - `api.py`: The public APIs.
8
+ - `bench_offline_throughput.py`: Benchmark the performance in the offline mode.
9
+ - `bench_one_batch.py`: Benchmark the latency of running a single static batch without a server.
10
+ - `bench_one_batch_server.py`: Benchmark the latency of running a single batch with a server.
11
+ - `bench_serving.py`: Benchmark online serving with dynamic requests.
12
+ - `check_env.py`: Check the environment variables and dependencies.
13
+ - `global_config.py`: The global configs and constants.
14
+ - `launch_server.py`: The entry point for launching a local server.
15
+ - `profiler.py`: The profiling entry point to send profile requests.
16
+ - `utils.py`: Common utilities.
17
+ - `version.py`: Version info.
sglang/__init__.py ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SGLang public APIs
2
+
3
+ # Frontend Language APIs
4
+ from sglang.global_config import global_config
5
+ from sglang.lang.api import (
6
+ Engine,
7
+ Runtime,
8
+ assistant,
9
+ assistant_begin,
10
+ assistant_end,
11
+ flush_cache,
12
+ function,
13
+ gen,
14
+ gen_int,
15
+ gen_string,
16
+ get_server_info,
17
+ image,
18
+ select,
19
+ separate_reasoning,
20
+ set_default_backend,
21
+ system,
22
+ system_begin,
23
+ system_end,
24
+ user,
25
+ user_begin,
26
+ user_end,
27
+ video,
28
+ )
29
+ from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
30
+ from sglang.lang.choices import (
31
+ greedy_token_selection,
32
+ token_length_normalized,
33
+ unconditional_likelihood_normalized,
34
+ )
35
+
36
+ # Lazy import some libraries
37
+ from sglang.utils import LazyImport
38
+ from sglang.version import __version__
39
+
40
+ Anthropic = LazyImport("sglang.lang.backend.anthropic", "Anthropic")
41
+ LiteLLM = LazyImport("sglang.lang.backend.litellm", "LiteLLM")
42
+ OpenAI = LazyImport("sglang.lang.backend.openai", "OpenAI")
43
+ VertexAI = LazyImport("sglang.lang.backend.vertexai", "VertexAI")
44
+
45
+ # Runtime Engine APIs
46
+ ServerArgs = LazyImport("sglang.srt.server_args", "ServerArgs")
47
+ Engine = LazyImport("sglang.srt.entrypoints.engine", "Engine")
48
+
49
+ __all__ = [
50
+ "Engine",
51
+ "Runtime",
52
+ "assistant",
53
+ "assistant_begin",
54
+ "assistant_end",
55
+ "flush_cache",
56
+ "function",
57
+ "gen",
58
+ "gen_int",
59
+ "gen_string",
60
+ "get_server_info",
61
+ "image",
62
+ "select",
63
+ "separate_reasoning",
64
+ "set_default_backend",
65
+ "system",
66
+ "system_begin",
67
+ "system_end",
68
+ "user",
69
+ "user_begin",
70
+ "user_end",
71
+ "video",
72
+ "RuntimeEndpoint",
73
+ "greedy_token_selection",
74
+ "token_length_normalized",
75
+ "unconditional_likelihood_normalized",
76
+ "ServerArgs",
77
+ "Anthropic",
78
+ "LiteLLM",
79
+ "OpenAI",
80
+ "VertexAI",
81
+ "global_config",
82
+ "__version__",
83
+ ]
sglang/bench_offline_throughput.py ADDED
@@ -0,0 +1,476 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Benchmark the throughput in the offline mode.
3
+ It accepts server arguments (the same as launch_server.py) and benchmark arguments (the same as bench_serving.py).
4
+
5
+ # Usage
6
+ ## Sharegpt dataset with default args
7
+ python -m sglang.bench_offline_throughput --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --num-prompts 10
8
+
9
+ ## Random dataset with default args
10
+ python -m sglang.bench_offline_throughput --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --dataset-name random --random-input 1024 --random-output 1024
11
+ """
12
+
13
+ import argparse
14
+ import asyncio
15
+ import dataclasses
16
+ import inspect
17
+ import json
18
+ import logging
19
+ import os
20
+ import random
21
+ import time
22
+ from typing import Dict, List, Optional
23
+
24
+ import numpy as np
25
+
26
+ from sglang.bench_serving import (
27
+ DatasetRow,
28
+ get_dataset,
29
+ get_tokenizer,
30
+ sample_random_requests,
31
+ set_ulimit,
32
+ )
33
+ from sglang.lang.backend.runtime_endpoint import Runtime
34
+ from sglang.srt.entrypoints.engine import Engine
35
+ from sglang.srt.server_args import ServerArgs
36
+
37
+
38
@dataclasses.dataclass
class BenchArgs:
    """CLI-configurable arguments for the offline throughput benchmark.

    The field defaults are the single source of truth for the argparse
    defaults (see ``add_cli_args``), so each option is defined in one place.
    """

    backend: str = "engine"
    result_filename: str = ""
    dataset_name: str = "sharegpt"
    dataset_path: str = ""
    num_prompts: int = 1000
    sharegpt_output_len: Optional[int] = None
    sharegpt_context_len: Optional[int] = None
    random_input_len: int = 1024
    random_output_len: int = 1024
    random_range_ratio: float = 0.0
    gsp_num_groups: int = 64
    gsp_prompts_per_group: int = 16
    gsp_system_prompt_len: int = 2048
    gsp_question_len: int = 128
    gsp_output_len: int = 256
    seed: int = 1
    disable_ignore_eos: bool = False
    extra_request_body: Optional[str] = None
    apply_chat_template: bool = False
    profile: bool = False
    skip_warmup: bool = False
    do_not_exit: bool = False
    prompt_suffix: str = ""
    return_logprob: bool = False
    logprob_start_len: int = -1

    @staticmethod
    def add_cli_args(parser: argparse.ArgumentParser):
        """Register all benchmark options on *parser*.

        Fixes vs. the previous version: all defaults consistently reference
        the dataclass fields (several options hardcoded duplicate literals),
        and the implicitly-concatenated help strings that were missing a
        separating space ("usedonly ...", "specifyadditional ...") are
        repaired.
        """
        parser.add_argument("--backend", type=str, default=BenchArgs.backend)
        parser.add_argument(
            "--result-filename", type=str, default=BenchArgs.result_filename
        )
        parser.add_argument(
            "--dataset-name",
            type=str,
            default=BenchArgs.dataset_name,
            choices=["sharegpt", "random", "generated-shared-prefix"],
            help="Name of the dataset to benchmark on.",
        )
        parser.add_argument(
            "--dataset-path",
            type=str,
            default=BenchArgs.dataset_path,
            help="Path to the dataset.",
        )
        parser.add_argument(
            "--num-prompts",
            type=int,
            default=BenchArgs.num_prompts,
            help="Number of prompts to process. Default is 1000.",
        )
        parser.add_argument(
            "--sharegpt-output-len",
            type=int,
            default=BenchArgs.sharegpt_output_len,
            help="Output length for each request. Overrides the output length from the ShareGPT dataset.",
        )
        parser.add_argument(
            "--sharegpt-context-len",
            type=int,
            default=BenchArgs.sharegpt_context_len,
            help="The context length of the model for the ShareGPT dataset. Requests longer than the context length will be dropped.",
        )
        parser.add_argument(
            "--random-input-len",
            type=int,
            default=BenchArgs.random_input_len,
            help="Number of input tokens per request, used only for random dataset.",
        )
        parser.add_argument(
            "--random-output-len",
            type=int,
            default=BenchArgs.random_output_len,
            help="Number of output tokens per request, used only for random dataset.",
        )
        parser.add_argument(
            "--random-range-ratio",
            type=float,
            default=BenchArgs.random_range_ratio,
            help="Range of sampled ratio of input/output length, used only for random dataset.",
        )
        parser.add_argument(
            "--gsp-num-groups",
            type=int,
            default=BenchArgs.gsp_num_groups,
            help="Number of groups with shared prefix, used only for generate-shared-prefix",
        )
        parser.add_argument(
            "--gsp-prompts-per-group",
            type=int,
            default=BenchArgs.gsp_prompts_per_group,
            help="Number of prompts per group of shared prefix, used only for generate-shared-prefix",
        )
        parser.add_argument(
            "--gsp-system-prompt-len",
            type=int,
            default=BenchArgs.gsp_system_prompt_len,
            help="System prompt length, used only for generate-shared-prefix",
        )
        parser.add_argument(
            "--gsp-question-len",
            type=int,
            default=BenchArgs.gsp_question_len,
            help="Question length, used only for generate-shared-prefix",
        )
        parser.add_argument(
            "--gsp-output-len",
            type=int,
            default=BenchArgs.gsp_output_len,
            help="Target length in tokens for outputs in generated-shared-prefix dataset",
        )
        parser.add_argument(
            "--seed", type=int, default=BenchArgs.seed, help="The random seed."
        )
        parser.add_argument(
            "--disable-ignore-eos",
            action="store_true",
            help="Disable ignore EOS token",
        )
        parser.add_argument(
            "--extra-request-body",
            metavar='{"key1": "value1", "key2": "value2"}',
            type=str,
            default=BenchArgs.extra_request_body,
            help="Append given JSON object to the request payload. You can use this to specify "
            "additional generate params like sampling params.",
        )
        parser.add_argument(
            "--apply-chat-template",
            action="store_true",
            help="Apply chat template",
        )
        parser.add_argument(
            "--profile",
            action="store_true",
            help="Use Torch Profiler. The endpoint must be launched with "
            "SGLANG_TORCH_PROFILER_DIR to enable profiler.",
        )
        parser.add_argument(
            "--skip-warmup",
            action="store_true",
            help="Skip the warmup batches.",
        )
        parser.add_argument(
            "--do-not-exit",
            action="store_true",
            help="Do not exit the program. This is useful for nsys profile with --duration and --delay.",
        )
        parser.add_argument(
            "--prompt-suffix",
            type=str,
            default=BenchArgs.prompt_suffix,
            help="Suffix applied to the end of all user prompts, followed by assistant prompt suffix.",
        )
        parser.add_argument(
            "--return-logprob",
            action="store_true",
            help="Enable returning log probabilities.",
        )
        parser.add_argument(
            "--logprob-start-len",
            type=int,
            default=BenchArgs.logprob_start_len,
            help="Start length for logprob. -1 means only return logprobs for output tokens (default). 0 means return logprobs for all tokens including input.",
        )

    @classmethod
    def from_cli_args(cls, args: argparse.Namespace):
        """Build a BenchArgs from a parsed namespace, field by field."""
        attrs = [attr.name for attr in dataclasses.fields(cls)]
        return cls(**{attr: getattr(args, attr) for attr in attrs})
208
+
209
+
210
def throughput_test_once(
    backend_name: str,
    backend,
    reqs: List[DatasetRow],
    ignore_eos: bool,
    extra_request_body: Dict,
    profile: bool,
    return_logprob: bool = False,
    logprob_start_len: int = -1,
):
    """Run one batched generation pass and return throughput metrics.

    Args:
        backend_name: "engine" or "runtime"; controls how results are decoded.
        backend: Object exposing ``generate`` / ``get_server_info`` (and
            ``start_profile`` / ``stop_profile`` when *profile* is set).
        reqs: Dataset rows providing prompts and per-request output lengths.
        ignore_eos: Forwarded into sampling params; keeps generating past EOS.
        extra_request_body: Extra keys merged into every sampling-params dict.
        profile: Enable the torch profiler (requires SGLANG_TORCH_PROFILER_DIR).
        return_logprob: Request log probabilities from the backend.
        logprob_start_len: Start position for logprobs; -1 = output tokens only.

    Returns:
        Dict with total latency and request/input/output/total throughputs.
    """
    measurement_results = {
        "backend": backend_name,
        "successful_requests": len(reqs),
        "total_latency": -1,
        "total_input_tokens": sum(r.prompt_len for r in reqs),
        "total_output_tokens": -1,
        "request_throughput": -1,
        "input_throughput": -1,
        "output_throughput": -1,
        "total_throughput": -1,
    }

    prompt = [r.prompt for r in reqs]
    sampling_params = [
        {
            "temperature": 0,
            "max_new_tokens": r.output_len,
            "ignore_eos": ignore_eos,
            **extra_request_body,
        }
        for r in reqs
    ]

    if profile:
        assert (
            "SGLANG_TORCH_PROFILER_DIR" in os.environ
        ), "Please set SGLANG_TORCH_PROFILER_DIR."
        os.makedirs(os.environ["SGLANG_TORCH_PROFILER_DIR"], exist_ok=True)
        backend.start_profile()

    st = time.perf_counter()
    gen_out = backend.generate(
        prompt=prompt,
        sampling_params=sampling_params,
        return_logprob=return_logprob,
        logprob_start_len=logprob_start_len,
    )
    latency = time.perf_counter() - st

    if profile:
        # Renamed from `dir` to avoid shadowing the builtin of the same name.
        trace_dir = os.getenv("SGLANG_TORCH_PROFILER_DIR")
        known_files = set(os.listdir(trace_dir))
        backend.stop_profile()
        # Block until the newly written trace file stops growing.
        monitor_trace_file(known_files, trace_dir)

    if backend_name == "runtime":
        # The runtime endpoint returns a JSON string rather than a list.
        gen_out = json.loads(gen_out)

    server_info = backend.get_server_info()

    measurement_results["total_latency"] = latency
    measurement_results["total_output_tokens"] = sum(
        o["meta_info"]["completion_tokens"] for o in gen_out
    )
    measurement_results["request_throughput"] = (
        measurement_results["successful_requests"] / latency
    )
    measurement_results["input_throughput"] = (
        measurement_results["total_input_tokens"] / latency
    )
    measurement_results["output_throughput"] = (
        measurement_results["total_output_tokens"] / latency
    )
    measurement_results["total_throughput"] = (
        measurement_results["total_input_tokens"]
        + measurement_results["total_output_tokens"]
    ) / latency

    if inspect.isawaitable(server_info):
        # Engine.get_server_info may return a coroutine; resolve it here.
        server_info = asyncio.run(server_info)

    measurement_results["last_gen_throughput"] = server_info["internal_states"][0][
        "last_gen_throughput"
    ]

    return measurement_results
296
+
297
+
298
def monitor_trace_file(known_files, directory, interval=1):
    """Block until a newly created file in *directory* stops growing.

    Polls *directory* every *interval* seconds for files that are not in
    *known_files*. Once the size of one such file is observed not to grow
    between two consecutive polls, the function returns. A new file that
    disappears while being watched is reported and polling continues.
    """
    print(f"Monitoring {directory} for new trace files...")

    done = False
    while not done:
        time.sleep(interval)
        arrivals = set(os.listdir(directory)) - known_files

        for new_file in arrivals:
            watched_path = os.path.join(directory, new_file)
            print(f"New file detected: {new_file}")

            last_size = 0
            while True:
                try:
                    size_now = os.path.getsize(watched_path)
                except FileNotFoundError:
                    print(f"File {new_file} is no longer accessible.")
                    break

                if size_now <= last_size:
                    # Size stopped growing: the trace dump has finished.
                    done = True
                    break

                last_size = size_now
                time.sleep(interval)
328
+
329
+
330
def throughput_test(
    server_args: ServerArgs,
    bench_args: BenchArgs,
):
    """Run the offline throughput benchmark and print/record a summary.

    Launches the selected backend, optionally runs a warmup batch, benchmarks
    the requested dataset once, then shuts the backend down.

    Args:
        server_args: Arguments used to construct the Engine/Runtime backend.
        bench_args: Benchmark configuration (backend kind, dataset, sizes).

    Returns:
        The measurement-results dict from the benchmarked run.

    Raises:
        ValueError: If ``bench_args.backend`` is not "engine" or "runtime".
    """
    if bench_args.backend == "engine":
        backend = Engine(**dataclasses.asdict(server_args))
        if not backend:
            raise ValueError("Please provide valid engine arguments")
    elif bench_args.backend == "runtime":
        backend = Runtime(**dataclasses.asdict(server_args))
    else:
        raise ValueError('Please set backend to either "engine" or "runtime"')

    tokenizer_id = server_args.tokenizer_path or server_args.model_path
    tokenizer = get_tokenizer(tokenizer_id)

    # Set global environments
    set_ulimit()
    random.seed(bench_args.seed)
    np.random.seed(bench_args.seed)

    # Parse args
    extra_request_body = {}
    if bench_args.extra_request_body:
        # BUG FIX: previously read the module-global `args`, which only exists
        # when this file is executed as a script; reading it here raised
        # NameError when throughput_test() was called as a library function.
        extra_request_body = json.loads(bench_args.extra_request_body)

    # Read dataset
    input_requests = get_dataset(bench_args, tokenizer)

    warmup_requests = sample_random_requests(
        input_len=256,
        output_len=16,
        num_prompts=min(bench_args.num_prompts, 16),
        range_ratio=1.0,
        tokenizer=tokenizer,
        dataset_path=bench_args.dataset_path,
    )

    # Warm up
    if not bench_args.skip_warmup:
        logging.info("\nWarmup...")
        throughput_test_once(
            backend_name=bench_args.backend,
            backend=backend,
            reqs=warmup_requests,
            ignore_eos=not bench_args.disable_ignore_eos,
            extra_request_body=extra_request_body,
            profile=False,
            return_logprob=bench_args.return_logprob,
            logprob_start_len=bench_args.logprob_start_len,
        )
        time.sleep(0.5)

    logging.info("\nBenchmark...")
    result = throughput_test_once(
        backend_name=bench_args.backend,
        backend=backend,
        reqs=input_requests,
        ignore_eos=not bench_args.disable_ignore_eos,
        extra_request_body=extra_request_body,
        profile=bench_args.profile,
        return_logprob=bench_args.return_logprob,
        logprob_start_len=bench_args.logprob_start_len,
    )
    backend.shutdown()

    if bench_args.result_filename:
        with open(bench_args.result_filename, "a") as fout:
            fout.write(json.dumps(result) + "\n")

    print(
        "\n{s:{c}^{n}}".format(s=" Offline Throughput Benchmark Result ", n=50, c="=")
    )
    print("{:<40} {:<10}".format("Backend:", result["backend"]))
    print("{:<40} {:<10}".format("Successful requests:", result["successful_requests"]))
    print("{:<40} {:<10.2f}".format("Benchmark duration (s):", result["total_latency"]))
    print("{:<40} {:<10}".format("Total input tokens:", result["total_input_tokens"]))
    print(
        "{:<40} {:<10}".format("Total generated tokens:", result["total_output_tokens"])
    )
    print(
        "{:<40} {:<10.2f}".format(
            "Last generation throughput (tok/s):", result["last_gen_throughput"]
        )
    )
    print(
        "{:<40} {:<10.2f}".format(
            "Request throughput (req/s):", result["request_throughput"]
        )
    )
    print(
        "{:<40} {:<10.2f}".format(
            "Input token throughput (tok/s):", result["input_throughput"]
        )
    )
    print(
        "{:<40} {:<10.2f}".format(
            "Output token throughput (tok/s):", result["output_throughput"]
        )
    )
    print(
        "{:<40} {:<10.2f}".format(
            "Total token throughput (tok/s):", result["total_throughput"]
        )
    )
    print("=" * 50)

    return result
438
+
439
+
440
if __name__ == "__main__":
    # Parse combined server + benchmark arguments from the command line.
    parser = argparse.ArgumentParser()
    ServerArgs.add_cli_args(parser)
    BenchArgs.add_cli_args(parser)
    args = parser.parse_args()

    # handling ModelScope model downloads
    if os.getenv("SGLANG_USE_MODELSCOPE", "false").lower() in ("true", "1"):
        if os.path.exists(args.model_path):
            print(f"Using local model path: {args.model_path}")
        else:
            try:
                from modelscope import snapshot_download

                print(f"Using ModelScope to download model: {args.model_path}")

                # download the model and replace args.model_path
                args.model_path = snapshot_download(
                    args.model_path,
                )
                print(f"Model downloaded to: {args.model_path}")
            except Exception as e:
                print(f"ModelScope download failed: {str(e)}")
                raise e

    server_args = ServerArgs.from_cli_args(args)
    bench_args = BenchArgs.from_cli_args(args)

    logging.basicConfig(
        level=getattr(logging, server_args.log_level.upper()),
        format="%(message)s",
    )

    throughput_test(server_args, bench_args)

    # Keep the process alive for nsys profiling with --duration/--delay.
    # FIX: the original `while ...: pass` busy-wait pinned a CPU core at
    # 100%; sleeping achieves the same "do not exit" behavior cheaply.
    while bench_args.do_not_exit:
        time.sleep(1)
sglang/bench_one_batch.py ADDED
@@ -0,0 +1,795 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Benchmark the latency of running a single static batch without a server.
3
+
4
+ This script does not launch a server and uses the low-level APIs.
5
+ It accepts server arguments (the same as launch_server.py) and benchmark arguments (e.g., batch size, input lengths).
6
+
7
+ # Usage (latency test)
8
+ ## with dummy weights:
9
+ python -m sglang.bench_one_batch --model-path meta-llama/Meta-Llama-3-8B-Instruct --load-format dummy
10
+ ## sweep through multiple data points and store (append) the results in a jsonl file:
11
+ python -m sglang.bench_one_batch --model-path meta-llama/Meta-Llama-3-8B-Instruct --batch 1 12 14 --input-len 256 512 --output-len 32 256 --run-name test_run
12
+ ## run with profiling:
13
+ python -m sglang.bench_one_batch --model-path meta-llama/Meta-Llama-3-8B-Instruct --batch 1 12 14 --input-len 256 512 --profile
14
+ ## run with profiling to custom directory:
15
+ export SGLANG_TORCH_PROFILER_DIR=/root/sglang/profile_log
16
+ python -m sglang.bench_one_batch --model-path meta-llama/Meta-Llama-3-8B-Instruct --batch 1 --input-len 256 --profile
17
+ ## run with CUDA profiler (nsys):
18
+ nsys profile --force-overwrite=true -o bench_one_batch python -m sglang.bench_one_batch --model-path meta-llama/Meta-Llama-3-8B-Instruct --batch 1 --input-len 256 --profile --profile-activities CUDA_PROFILER
19
+ # Usage (correctness test):
20
+ python -m sglang.bench_one_batch --model-path TinyLlama/TinyLlama-1.1B-Chat-v0.4 --correct
21
+
22
+ ## Reference output (of the correctness test above, can be gpu dependent):
23
+ input_ids=[[1, 450, 7483, 310, 3444, 338], [1, 450, 7483, 310, 278, 3303, 13187, 290, 338], [1, 20628, 338, 263, 6575, 1460, 2462, 322, 306, 763]]
24
+
25
+ prefill logits (first half): tensor([[-10.0312, -9.5000, 0.8931, ..., -4.9414, -3.2422, -3.3633],
26
+ [-10.0312, -9.5000, 0.8931, ..., -4.9414, -3.2422, -3.3633],
27
+ [ -9.1875, -10.2500, 2.7129, ..., -4.3359, -4.0664, -4.1328]],
28
+ device='cuda:0')
29
+
30
+ prefill logits (final): tensor([[-8.3125, -7.1172, 3.3457, ..., -4.9570, -4.1328, -3.4141],
31
+ [-8.9141, -9.0156, 4.1445, ..., -4.9922, -4.4961, -4.0781],
32
+ [-9.6328, -9.0547, 4.0195, ..., -5.3047, -4.7148, -4.4570]],
33
+ device='cuda:0')
34
+
35
+ ========== Prompt 0 ==========
36
+ <s> The capital of France is Paris.
37
+ The capital of the United States is Washington, D.C.
38
+
39
+
40
+ ========== Prompt 1 ==========
41
+ <s> The capital of the United Kindom is London.
42
+ The capital of the United Kingdom is London.
43
+ The capital of the
44
+
45
+ ========== Prompt 2 ==========
46
+ <s> Today is a sunny day and I like to go for a walk in the park.
47
+ I'm going to the park
48
+ """
49
+
50
+ import argparse
51
+ import copy
52
+ import dataclasses
53
+ import itertools
54
+ import json
55
+ import logging
56
+ import multiprocessing
57
+ import os
58
+ import time
59
+ from types import SimpleNamespace
60
+ from typing import Tuple
61
+
62
+ import numpy as np
63
+ import torch
64
+ import torch.distributed as dist
65
+
66
+ from sglang.srt.configs.model_config import ModelConfig
67
+ from sglang.srt.distributed.parallel_state import destroy_distributed_environment
68
+ from sglang.srt.entrypoints.engine import _set_envs_and_config
69
+ from sglang.srt.layers.moe import initialize_moe_config
70
+ from sglang.srt.managers.schedule_batch import Req, ScheduleBatch
71
+ from sglang.srt.managers.scheduler_dp_attn_mixin import prepare_mlp_sync_batch_raw
72
+ from sglang.srt.model_executor.forward_batch_info import ForwardBatch
73
+ from sglang.srt.model_executor.model_runner import ModelRunner
74
+ from sglang.srt.sampling.sampling_params import SamplingParams
75
+ from sglang.srt.server_args import PortArgs, ServerArgs
76
+ from sglang.srt.speculative.spec_info import SpeculativeAlgorithm
77
+ from sglang.srt.utils import (
78
+ configure_logger,
79
+ get_bool_env_var,
80
+ is_cuda_alike,
81
+ is_xpu,
82
+ kill_process_tree,
83
+ maybe_reindex_device_id,
84
+ require_mlp_sync,
85
+ require_mlp_tp_gather,
86
+ set_gpu_proc_affinity,
87
+ suppress_other_loggers,
88
+ )
89
+ from sglang.srt.utils.hf_transformers_utils import get_tokenizer
90
+
91
# Default torch-profiler activity list: always trace CPU ops, plus whichever
# GPU backend(s) this machine actually has (CUDA-alike devices and/or Intel XPU).
profile_activities = [torch.profiler.ProfilerActivity.CPU] + [
    profiler_activity
    for available, profiler_activity in [
        (is_cuda_alike(), torch.profiler.ProfilerActivity.CUDA),
        (is_xpu(), torch.profiler.ProfilerActivity.XPU),
    ]
    if available
]
99
+
100
+
101
def start_profile(profile_activities, profile_record_shapes=False, rank_print=print):
    """Begin profiling according to *profile_activities*.

    If "CUDA_PROFILER" is requested, start the CUDA runtime profiler (for
    nsys capture) and return ``None``. Otherwise build and start a
    ``torch.profiler.profile`` covering the requested CPU/GPU activities and
    return it. Returns ``None`` when no activity is selected.
    """
    if "CUDA_PROFILER" in profile_activities:
        try:
            torch.cuda.cudart().cudaProfilerStart()
            rank_print("CUDA Profiler started (nsys will begin capturing)")
        except Exception as e:
            rank_print(f"Failed to start CUDA profiler: {e}")
        return None

    activities = []
    if "CPU" in profile_activities:
        activities.append(torch.profiler.ProfilerActivity.CPU)
    if "GPU" in profile_activities:
        activities.append(torch.profiler.ProfilerActivity.CUDA)
    if not activities:
        return None

    profiler = torch.profiler.profile(
        activities=activities,
        with_stack=True,
        record_shapes=profile_record_shapes,
    )
    profiler.start()
    return profiler
128
+
129
+
130
def stop_profile(
    profiler,
    profile_activities,
    rank_print=print,
    save_trace=False,
    trace_filename=None,
    stage=None,
):
    """Stop a profiling session started by ``start_profile``.

    Stops either the CUDA runtime profiler or the given torch profiler, and,
    when *save_trace* is set, exports the torch trace (if any) and prints
    completion messages.
    """
    use_cuda_profiler = "CUDA_PROFILER" in profile_activities
    if use_cuda_profiler:
        try:
            torch.cuda.cudart().cudaProfilerStop()
            rank_print("CUDA Profiler stopped (nsys should dump traces)")
        except Exception as e:
            rank_print(f"Failed to stop CUDA profiler: {e}")
    elif profiler is not None:
        profiler.stop()

    if not save_trace:
        return

    if profiler is not None:
        if trace_filename:
            _save_profile_trace_results(profiler, trace_filename)
        stage_desc = f"for {stage}" if stage else ""
        rank_print(
            f"torch profiler chrome trace {stage_desc} saved to {trace_filename}"
        )
    if use_cuda_profiler:
        rank_print(f"CUDA profiler trace for {stage} completed")
161
+
162
+
163
@dataclasses.dataclass
class BenchArgs:
    """Benchmark-only arguments (server-side knobs live in ServerArgs)."""

    run_name: str = "default"
    batch_size: Tuple[int, ...] = (1,)
    input_len: Tuple[int, ...] = (1024,)
    output_len: Tuple[int, ...] = (16,)
    prompt_filename: str = ""
    result_filename: str = "result.jsonl"
    correctness_test: bool = False
    # This is only used for correctness test
    cut_len: int = 4
    log_decode_step: int = 0
    profile: bool = False
    profile_record_shapes: bool = False
    profile_activities: Tuple[str, ...] = ("CPU", "GPU")
    profile_stage: str = "all"
    profile_filename_prefix: str = "profile"

    @staticmethod
    def add_cli_args(parser: argparse.ArgumentParser):
        """Register all benchmark arguments on *parser*."""
        parser.add_argument("--run-name", type=str, default=BenchArgs.run_name)
        parser.add_argument(
            "--batch-size", type=int, nargs="+", default=BenchArgs.batch_size
        )
        parser.add_argument(
            "--input-len", type=int, nargs="+", default=BenchArgs.input_len
        )
        parser.add_argument(
            "--output-len", type=int, nargs="+", default=BenchArgs.output_len
        )
        parser.add_argument(
            "--prompt-filename", type=str, default=BenchArgs.prompt_filename
        )
        parser.add_argument(
            "--result-filename", type=str, default=BenchArgs.result_filename
        )
        parser.add_argument("--correctness-test", action="store_true")
        parser.add_argument("--cut-len", type=int, default=BenchArgs.cut_len)
        parser.add_argument(
            "--log-decode-step",
            type=int,
            default=BenchArgs.log_decode_step,
            help="Log decode latency by step, default is set to zero to disable.",
        )
        parser.add_argument("--profile", action="store_true", help="Enable profiling.")
        parser.add_argument(
            "--profile-record-shapes",
            action="store_true",
            help="Record tensor shapes in profiling results.",
        )
        parser.add_argument(
            "--profile-activities",
            type=str,
            nargs="+",
            # Consistency fix: reuse the dataclass default (the original
            # hard-coded ["CPU", "GPU"] here, which could silently drift
            # from the field default above).
            default=list(BenchArgs.profile_activities),
            choices=["CPU", "GPU", "CUDA_PROFILER"],
            help="Profiler activities: CPU, GPU, CUDA_PROFILER. If CPU/GPU, use torch profiler. If CUDA_PROFILER, use CUDA profiler.",
        )
        parser.add_argument(
            "--profile-stage",
            type=str,
            default=BenchArgs.profile_stage,
            choices=["all", "prefill", "decode"],
            help="Which stage to profile: all, prefill, or decode only.",
        )
        parser.add_argument(
            "--profile-filename-prefix",
            type=str,
            default=BenchArgs.profile_filename_prefix,
            help="Prefix of the profiling file names. The full profiling result file(s) be "
            '"[profile_filename_prefix]_batch[batch_size]_input[input_len]_output[output_len].trace.json.gz"',
        )

    @classmethod
    def from_cli_args(cls, args: argparse.Namespace):
        """Build a BenchArgs from parsed CLI args.

        argparse returns lists for `nargs="+"` options; casting each value
        through the type of the field's default converts them back to tuples.
        """
        # use the default value's type to cast the args into correct types.
        attrs = [(attr.name, type(attr.default)) for attr in dataclasses.fields(cls)]
        return cls(
            **{attr: attr_type(getattr(args, attr)) for attr, attr_type in attrs}
        )
243
+
244
+
245
def load_model(server_args, port_args, gpu_id, tp_rank):
    """Build a ModelRunner and tokenizer for one tensor-parallel rank.

    Only rank 0 prints; when tp_size > 1 all ranks hit a barrier at the end
    so model loading finishes everywhere before benchmarking starts.
    """
    suppress_other_loggers()
    rank_print = print if tp_rank == 0 else lambda *args, **kwargs: None
    # Each expert-parallel group spans tp_size // ep_size TP ranks.
    moe_ep_rank = tp_rank // (server_args.tp_size // server_args.ep_size)

    model_config = ModelConfig.from_server_args(server_args)
    model_runner = ModelRunner(
        model_config=model_config,
        mem_fraction_static=server_args.mem_fraction_static,
        gpu_id=gpu_id,
        tp_rank=tp_rank,
        tp_size=server_args.tp_size,
        moe_ep_rank=moe_ep_rank,
        moe_ep_size=server_args.ep_size,
        pp_rank=0,  # this benchmark does not use pipeline parallelism
        pp_size=1,
        nccl_port=port_args.nccl_port,
        server_args=server_args,
    )
    rank_print(f"max_total_num_tokens={model_runner.max_total_num_tokens}")
    tokenizer = get_tokenizer(
        server_args.tokenizer_path,
        tokenizer_mode=server_args.tokenizer_mode,
        trust_remote_code=server_args.trust_remote_code,
    )
    if server_args.tp_size > 1:
        dist.barrier()
    return model_runner, tokenizer
273
+
274
+
275
def prepare_inputs_for_correctness_test(bench_args, tokenizer, custom_prompts):
    """Tokenize prompts and build truncated Req objects for the first prefill.

    Each request initially carries only the first ``bench_args.cut_len`` tokens;
    the remainder is appended later by prepare_extend_inputs_for_correctness_test.
    Returns ``(full_input_ids, reqs)``.
    """
    prompts = (
        custom_prompts
        if custom_prompts
        else [
            "The capital of France is",
            "The capital of the United Kindom is",
            "Today is a sunny day and I like",
        ]
    )
    input_ids = [tokenizer.encode(p) for p in prompts]
    sampling_params = SamplingParams(
        temperature=0,
        # Bug fix: the original passed BenchArgs.output_len (the class-level
        # default, a tuple) instead of the per-run CLI value, which both
        # ignored --output-len and handed a tuple to an int-valued field.
        max_new_tokens=bench_args.output_len[0],
    )

    reqs = []
    for i in range(len(prompts)):
        # Each prompt must be longer than the cut point, otherwise the
        # two-phase prefill test is meaningless.
        assert len(input_ids[i]) > bench_args.cut_len

        tmp_input_ids = input_ids[i][: bench_args.cut_len]
        req = Req(
            rid=i,
            origin_input_text=prompts[i],
            origin_input_ids=tmp_input_ids,
            sampling_params=sampling_params,
        )
        req.fill_ids = req.origin_input_ids
        req.extend_input_len = len(req.fill_ids) - len(req.prefix_indices)
        req.logprob_start_len = len(req.origin_input_ids) - 1
        reqs.append(req)

    return input_ids, reqs
308
+
309
+
310
def prepare_extend_inputs_for_correctness_test(
    bench_args, input_ids, reqs, model_runner
):
    """Append the tokens after the cut point so the next extend completes each prompt.

    The first `cut_len` tokens are already cached, so they become the prefix and
    only the remainder counts toward `extend_input_len`.
    """
    cut = bench_args.cut_len
    req_to_token = model_runner.req_to_token_pool.req_to_token
    for idx, req in enumerate(reqs):
        req.fill_ids += input_ids[idx][cut:]
        req.prefix_indices = req_to_token[idx, :cut]
        req.extend_input_len = len(req.fill_ids) - len(req.prefix_indices)
        req.logprob_start_len = len(req.origin_input_ids) - 1
    return reqs
322
+
323
+
324
def prepare_synthetic_inputs_for_latency_test(
    batch_size, input_len, custom_inputs=None
):
    """Build `batch_size` Req objects with random (or caller-supplied) token ids.

    Args:
        batch_size: number of requests when generating random inputs.
        input_len: tokens per request when generating random inputs.
        custom_inputs: optional pre-tokenized prompts; when truthy they are
            used verbatim instead of random ids.
    """
    input_ids = (
        custom_inputs
        if custom_inputs
        else np.random.randint(0, 10000, (batch_size, input_len), dtype=np.int32)
    )
    sampling_params = SamplingParams(
        temperature=0,
        # Bug fix: the original passed BenchArgs.output_len itself (a tuple);
        # pass the scalar default. The actual decode length is driven by the
        # caller's loop in latency_test_run_once, not by this value.
        max_new_tokens=BenchArgs.output_len[0],
    )

    reqs = []
    for i in range(len(input_ids)):
        req = Req(
            rid=i,
            origin_input_text="",
            origin_input_ids=list(input_ids[i]),
            sampling_params=sampling_params,
        )
        req.fill_ids = req.origin_input_ids
        req.extend_input_len = len(req.fill_ids) - len(req.prefix_indices)
        req.logprob_start_len = len(req.origin_input_ids) - 1
        reqs.append(req)

    return reqs
351
+
352
+
353
@torch.no_grad
def extend(reqs, model_runner):
    """Run one prefill (extend) forward pass for `reqs` and sample next tokens.

    Returns (next_token_ids, next_token_logits, batch); `batch` is kept and
    reused by subsequent `decode` calls.
    """
    # Create dummy tree_cache for benchmarks (no prefix caching, just allocation)
    dummy_tree_cache = SimpleNamespace(
        page_size=model_runner.server_args.page_size,
        device=model_runner.device,
        token_to_kv_pool_allocator=model_runner.token_to_kv_pool_allocator,
    )

    batch = ScheduleBatch.init_new(
        reqs=reqs,
        req_to_token_pool=model_runner.req_to_token_pool,
        token_to_kv_pool_allocator=model_runner.token_to_kv_pool_allocator,
        tree_cache=dummy_tree_cache,
        model_config=model_runner.model_config,
        enable_overlap=False,
        spec_algorithm=SpeculativeAlgorithm.NONE,
    )
    batch.prepare_for_extend()
    # DP-attention deployments must sync batch metadata across ranks before
    # building the worker batch.
    _maybe_prepare_mlp_sync_batch(batch, model_runner)
    model_worker_batch = batch.get_model_worker_batch()
    forward_batch = ForwardBatch.init_new(model_worker_batch, model_runner)
    logits_output, _ = model_runner.forward(forward_batch)
    next_token_ids = model_runner.sample(logits_output, forward_batch)
    return next_token_ids, logits_output.next_token_logits, batch
378
+
379
+
380
@torch.no_grad
def decode(input_token_ids, batch, model_runner):
    """Run one decode step: feed the previously sampled tokens and sample again.

    Mutates `batch` in place (prepare_for_decode advances it by one token per
    request); returns (next_token_ids, next_token_logits).
    """
    batch.output_ids = input_token_ids
    batch.prepare_for_decode()
    _maybe_prepare_mlp_sync_batch(batch, model_runner)
    model_worker_batch = batch.get_model_worker_batch()
    forward_batch = ForwardBatch.init_new(model_worker_batch, model_runner)
    logits_output, _ = model_runner.forward(forward_batch)
    next_token_ids = model_runner.sample(logits_output, forward_batch)
    return next_token_ids, logits_output.next_token_logits
390
+
391
+
392
def _maybe_prepare_mlp_sync_batch(batch: ScheduleBatch, model_runner):
    """Sync batch metadata across ranks when the server config requires MLP sync.

    No-op for plain TP; only runs for configurations where require_mlp_sync()
    is true (e.g. DP attention).
    """
    if require_mlp_sync(model_runner.server_args):
        prepare_mlp_sync_batch_raw(
            batch,
            dp_size=model_runner.server_args.dp_size,
            attn_tp_size=1,  # this benchmark always runs with attention TP of 1
            tp_group=model_runner.tp_group,
            get_idle_batch=None,
            disable_cuda_graph=model_runner.server_args.disable_cuda_graph,
            require_mlp_tp_gather=require_mlp_tp_gather(model_runner.server_args),
            disable_overlap_schedule=model_runner.server_args.disable_overlap_schedule,
            offload_tags=set(),
        )
405
+
406
+
407
+ def _read_prompts_from_file(prompt_file, rank_print):
408
+ """Read custom prompts from the file specified by `--prompt-filename`."""
409
+ if not prompt_file:
410
+ return []
411
+ if not os.path.exists(prompt_file):
412
+ rank_print(
413
+ f"Custom prompt file {prompt_file} not found. Using default inputs..."
414
+ )
415
+ return []
416
+ with open(prompt_file, "r") as pf:
417
+ return pf.readlines()
418
+
419
+
420
+ def _get_torch_profiler_output_dir():
421
+ return os.environ.get("SGLANG_TORCH_PROFILER_DIR", "/tmp")
422
+
423
+
424
def _create_torch_profiler_filename(
    profile_filename_prefix, batch_size, input_len, output_len, stage
):
    """Build the trace path: <dir>/<prefix>_batch<b>_input<i>_output<o>_<stage>.trace.json.gz."""
    basename = (
        f"{profile_filename_prefix}_batch{batch_size}"
        f"_input{input_len}_output{output_len}_{stage}.trace.json.gz"
    )
    return os.path.join(_get_torch_profiler_output_dir(), basename)
430
+
431
+
432
def _save_profile_trace_results(profiler, filename):
    """Export *profiler* as a chrome trace to *filename* and print a CPU-time summary."""
    os.makedirs(os.path.dirname(os.path.abspath(filename)), exist_ok=True)
    profiler.export_chrome_trace(filename)
    summary_table = profiler.key_averages(group_by_input_shape=True).table(
        sort_by="self_cpu_time_total"
    )
    print(summary_table)
441
+
442
+
443
def correctness_test(
    server_args,
    port_args,
    bench_args,
    gpu_id,
    tp_rank,
):
    """Greedy-decode a few fixed prompts and print logits/completions for eyeballing.

    Runs a truncated prefill (first `cut_len` tokens), an extend that completes
    the prompts against the cached prefix, then a manual decode loop. The
    expected reference output is reproduced in the module docstring.
    """
    # Configure the logger
    configure_logger(server_args, prefix=f" TP{tp_rank}")
    rank_print = print if tp_rank == 0 else lambda *args, **kwargs: None

    # Load the model
    model_runner, tokenizer = load_model(server_args, port_args, gpu_id, tp_rank)

    # Prepare inputs
    custom_prompts = _read_prompts_from_file(bench_args.prompt_filename, rank_print)
    input_ids, reqs = prepare_inputs_for_correctness_test(
        bench_args, tokenizer, custom_prompts
    )
    rank_print(f"\n{input_ids=}\n")

    if bench_args.cut_len > 0:
        # Prefill
        next_token_ids, next_token_logits, batch = extend(reqs, model_runner)
        rank_print(f"prefill logits (first half): {next_token_logits} \n")

    # Prepare extend inputs
    reqs = prepare_extend_inputs_for_correctness_test(
        bench_args, input_ids, reqs, model_runner
    )

    # Extend (prefill w/ KV cache)
    next_token_ids, next_token_logits, batch = extend(reqs, model_runner)
    rank_print(f"prefill logits (final): {next_token_logits} \n")

    # Decode
    # NOTE(review): next_token_ids here appears to be a tensor, so the first
    # appended element is a 0-d tensor while later ones are ints from
    # .tolist() — tokenizer.decode below seems to accept both; confirm.
    output_ids = [input_ids[i] + [next_token_ids[i]] for i in range(len(input_ids))]
    for _ in range(bench_args.output_len[0] - 1):
        next_token_ids, _ = decode(next_token_ids, batch, model_runner)
        next_token_ids_list = next_token_ids.tolist()
        for i in range(len(reqs)):
            output_ids[i].append(next_token_ids_list[i])

    # Print output texts
    for i in range(len(reqs)):
        rank_print(f"========== Prompt {i} ==========")
        rank_print(tokenizer.decode(output_ids[i]), "\n")
490
+
491
+
492
+ def synchronize(device):
493
+ torch.get_device_module(device).synchronize()
494
+
495
+
496
def latency_test_run_once(
    run_name,
    model_runner,
    rank_print,
    reqs,
    batch_size,
    input_len,
    output_len,
    device,
    log_decode_step,
    profile,
    profile_record_shapes,
    profile_activities,
    profile_filename_prefix,
    profile_stage,
    tp_rank,
):
    """Time one prefill plus `output_len - 1` decode steps for a single batch.

    Returns a dict of measurements (latencies and throughputs), or None when
    the requested batch does not fit in the KV cache. Optionally wraps the
    prefill and/or one mid-run decode step in a profiler.
    """
    # Skip configurations whose KV-cache footprint exceeds the pool.
    max_batch_size = model_runner.max_total_num_tokens // (input_len + output_len)
    if batch_size > max_batch_size:
        rank_print(
            f"skipping ({batch_size}, {input_len}, {output_len}) due to max batch size limit"
        )
        return

    # Start each run from an empty KV cache so runs are independent.
    model_runner.req_to_token_pool.clear()
    model_runner.token_to_kv_pool_allocator.clear()

    measurement_results = {
        "run_name": run_name,
        "batch_size": batch_size,
        "input_len": input_len,
        "output_len": output_len,
    }

    tot_latency = 0

    profiler = None
    enable_profile_prefill = profile and profile_stage in ["all", "prefill"]
    if enable_profile_prefill:
        profiler = start_profile(
            profile_activities,
            profile_record_shapes=profile_record_shapes,
            rank_print=rank_print,
        )

    # Prefill: synchronize on both sides of the forward so the wall-clock
    # delta measures only this batch's work.
    synchronize(device)
    tic = time.perf_counter()
    next_token_ids, _, batch = extend(reqs, model_runner)
    synchronize(device)
    prefill_latency = time.perf_counter() - tic

    if enable_profile_prefill:
        trace_filename = _create_torch_profiler_filename(
            profile_filename_prefix, batch_size, input_len, output_len, "prefill"
        )
        stop_profile(
            profiler,
            profile_activities,
            rank_print=rank_print,
            save_trace=True,
            trace_filename=trace_filename,
            stage="prefill",
        )

    tot_latency += prefill_latency
    throughput = input_len * batch_size / prefill_latency
    rank_print(
        f"Prefill. latency: {prefill_latency:6.5f} s, throughput: {throughput:9.2f} token/s"
    )
    measurement_results["prefill_latency"] = prefill_latency
    measurement_results["prefill_throughput"] = throughput

    # Decode loop: profile only the middle step (steady state) when requested.
    decode_latencies = []
    profile_step_of_interest = output_len // 2
    enable_profile_decode = profile and profile_stage in ["all", "decode"]
    for i in range(output_len - 1):
        synchronize(device)
        profiler = None
        if enable_profile_decode and i == profile_step_of_interest:
            profiler = start_profile(
                profile_activities,
                profile_record_shapes=profile_record_shapes,
                rank_print=rank_print,
            )

        tic = time.perf_counter()
        next_token_ids, _ = decode(next_token_ids, batch, model_runner)
        synchronize(device)
        latency = time.perf_counter() - tic

        if enable_profile_decode and i == profile_step_of_interest:
            trace_filename = _create_torch_profiler_filename(
                profile_filename_prefix, batch_size, input_len, output_len, "decode"
            )
            stop_profile(
                profiler,
                profile_activities,
                rank_print=rank_print,
                save_trace=True,
                trace_filename=trace_filename,
                stage="decode",
            )

        tot_latency += latency
        throughput = batch_size / latency
        decode_latencies.append(latency)
        if i < 5 or (log_decode_step > 0 and i % log_decode_step == 0):
            rank_print(
                f"Decode {i}. Batch size: {batch_size}, latency: {latency:6.5f} s, throughput: {throughput:9.2f} token/s"
            )

    # Record decode timing from 2nd output
    if output_len > 1:
        # Median is robust to the first (cold) decode steps.
        med_decode_latency = np.median(decode_latencies)
        med_decode_throughput = batch_size / med_decode_latency
        rank_print(
            f"Decode. median latency: {med_decode_latency:6.5f} s, median throughput: {med_decode_throughput:9.2f} token/s"
        )
        measurement_results["median_decode_latency"] = med_decode_latency
        measurement_results["median_decode_throughput"] = med_decode_throughput

    throughput = (input_len + output_len) * batch_size / tot_latency
    rank_print(
        f"Total. latency: {tot_latency:6.3f} s, throughput: {throughput:9.2f} token/s"
    )
    measurement_results["total_latency"] = tot_latency
    measurement_results["overall_throughput"] = throughput
    return measurement_results
624
+
625
+
626
def latency_test(
    server_args,
    port_args,
    bench_args,
    gpu_id,
    tp_rank,
):
    """Sweep (batch_size, input_len, output_len) combinations and record latency.

    Loads the model once, runs a short warmup, then runs
    latency_test_run_once for every combination and appends the results (rank
    0 only) to `bench_args.result_filename` in jsonlines format.
    """
    initialize_moe_config(server_args)

    # Set CPU affinity
    if get_bool_env_var("SGLANG_SET_CPU_AFFINITY"):
        set_gpu_proc_affinity(
            server_args.pp_size, server_args.tp_size, server_args.nnodes, tp_rank
        )

    # Configure the logger
    configure_logger(server_args, prefix=f" TP{tp_rank}")
    rank_print = print if tp_rank == 0 else lambda *args, **kwargs: None

    # Load the model
    model_runner, tokenizer = load_model(server_args, port_args, gpu_id, tp_rank)

    # Prepare inputs for warm up
    reqs = prepare_synthetic_inputs_for_latency_test(
        bench_args.batch_size[0], bench_args.input_len[0]
    )

    # Warm up
    rank_print("Warmup ...")
    latency_test_run_once(
        bench_args.run_name,
        model_runner,
        rank_print,
        reqs,
        bench_args.batch_size[0],
        bench_args.input_len[0],
        min(32, bench_args.output_len[0]),  # shorter decoding to speed up the warmup
        server_args.device,
        log_decode_step=0,
        profile=False,
        profile_record_shapes=False,
        profile_activities=("CPU", "GPU"),
        profile_filename_prefix="",
        profile_stage="all",
        tp_rank=tp_rank,
    )

    rank_print("Benchmark ...")

    custom_inputs = _read_prompts_from_file(bench_args.prompt_filename, rank_print)
    custom_inputs = [tokenizer.encode(p.strip()) for p in custom_inputs]
    custom_input_len = len(custom_inputs)

    # Run the sweep
    result_list = []
    for bs, il, ol in itertools.product(
        bench_args.batch_size, bench_args.input_len, bench_args.output_len
    ):
        # Align the number of custom prompts with the batch size:
        # truncate when too many, pad with the last prompt when too few.
        bs_aligned_inputs = []
        if custom_inputs:
            if custom_input_len == bs:
                bs_aligned_inputs = custom_inputs
            elif custom_input_len > bs:
                rank_print(
                    f"Custom input size ({custom_input_len}) is larger than batch_size ({bs}). "
                    f"Using the first {bs} prompts."
                )
                bs_aligned_inputs = copy.deepcopy(custom_inputs[:bs])
            else:
                rank_print(
                    f"Custom input size ({custom_input_len}) is smaller than batch_size ({bs}). "
                    f"Pad to the desired batch_size with the last prompt."
                )
                bs_aligned_inputs = copy.deepcopy(custom_inputs)
                bs_aligned_inputs.extend(
                    [bs_aligned_inputs[-1]] * (bs - custom_input_len)
                )

        reqs = prepare_synthetic_inputs_for_latency_test(bs, il, bs_aligned_inputs)
        ret = latency_test_run_once(
            bench_args.run_name,
            model_runner,
            rank_print,
            reqs,
            bs,
            il,
            ol,
            server_args.device,
            bench_args.log_decode_step,
            # Only rank 0 profiles; other ranks pass None (falsy).
            bench_args.profile if tp_rank == 0 else None,
            bench_args.profile_record_shapes if tp_rank == 0 else None,
            bench_args.profile_activities,
            bench_args.profile_filename_prefix,
            bench_args.profile_stage,
            tp_rank,
        )
        if ret is not None:
            result_list.append(ret)

    # Write results in jsonlines format on rank 0.
    if tp_rank == 0 and bench_args.result_filename:
        with open(bench_args.result_filename, "a") as fout:
            for result in result_list:
                fout.write(json.dumps(result) + "\n")

    if server_args.tp_size > 1:
        destroy_distributed_environment()
733
+
734
+
735
def main(server_args, bench_args):
    """Dispatch to correctness_test or latency_test, one process per TP rank.

    For tp_size == 1 the test runs in-process; otherwise one worker process is
    spawned per rank and joined at the end.
    """
    server_args.cuda_graph_max_bs = max(bench_args.batch_size)

    _set_envs_and_config(server_args)

    if server_args.model_path:
        if bench_args.correctness_test:
            work_func = correctness_test
        else:
            work_func = latency_test
    else:
        raise ValueError(
            "Provide --model-path for running the tests or "
            "provide --result-filename for plotting the results"
        )

    port_args = PortArgs.init_new(server_args)

    if server_args.tp_size == 1:
        work_func(server_args, port_args, bench_args, 0, 0)
    else:
        workers = []
        for tp_rank in range(server_args.tp_size):
            with maybe_reindex_device_id(tp_rank) as gpu_id:
                proc = multiprocessing.Process(
                    target=work_func,
                    args=(
                        server_args,
                        port_args,
                        bench_args,
                        gpu_id,
                        tp_rank,
                    ),
                )
                proc.start()
                workers.append(proc)

        for proc in workers:
            proc.join()

        # Bug fix: the original called proc.terminate() once after the join
        # loop, which only targeted the last worker. Terminate every worker
        # as a safety net (a no-op for processes that already exited).
        for proc in workers:
            proc.terminate()
776
+
777
+
778
+ if __name__ == "__main__":
779
+ parser = argparse.ArgumentParser()
780
+ ServerArgs.add_cli_args(parser)
781
+ BenchArgs.add_cli_args(parser)
782
+ args = parser.parse_args()
783
+ server_args = ServerArgs.from_cli_args(args)
784
+ bench_args = BenchArgs.from_cli_args(args)
785
+
786
+ logging.basicConfig(
787
+ level=getattr(logging, server_args.log_level.upper()),
788
+ format="%(message)s",
789
+ )
790
+
791
+ try:
792
+ main(server_args, bench_args)
793
+ finally:
794
+ if server_args.tp_size != 1:
795
+ kill_process_tree(os.getpid(), include_parent=False)
sglang/bench_one_batch_server.py ADDED
@@ -0,0 +1,605 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Benchmark the latency of running a single batch with a server.
3
+
4
+ This script launches a server and uses the HTTP interface.
5
+ It accepts server arguments (the same as launch_server.py) and benchmark arguments (e.g., batch size, input lengths).
6
+
7
+ Usage:
8
+ python3 -m sglang.bench_one_batch_server --model meta-llama/Meta-Llama-3.1-8B --batch-size 1 16 64 --input-len 1024 --output-len 8
9
+
10
+ python3 -m sglang.bench_one_batch_server --model None --base-url http://localhost:30000 --batch-size 16 --input-len 1024 --output-len 8
11
+ python3 -m sglang.bench_one_batch_server --model None --base-url http://localhost:30000 --batch-size 16 --input-len 1024 --output-len 8 --show-report --profile --profile-by-stage
12
+ python3 -m sglang.bench_one_batch_server --model None --base-url http://localhost:30000 --batch-size 16 --input-len 1024 --output-len 8 --output-path results.json --profile
13
+ """
14
+
15
+ import argparse
16
+ import dataclasses
17
+ import itertools
18
+ import json
19
+ import multiprocessing
20
+ import os
21
+ import random
22
+ import time
23
+ from typing import List, Optional, Tuple
24
+
25
+ import numpy as np
26
+ import requests
27
+ from pydantic import BaseModel
28
+ from transformers import AutoProcessor, PreTrainedTokenizer
29
+
30
+ from sglang.bench_serving import (
31
+ get_processor,
32
+ get_tokenizer,
33
+ sample_mmmu_requests,
34
+ sample_random_requests,
35
+ )
36
+ from sglang.profiler import run_profile
37
+ from sglang.srt.entrypoints.http_server import launch_server
38
+ from sglang.srt.server_args import ServerArgs
39
+ from sglang.srt.utils import is_blackwell, kill_process_tree
40
+ from sglang.test.nightly_bench_utils import save_results_as_pydantic_models
41
+ from sglang.test.test_utils import is_in_ci, write_github_step_summary
42
+
43
+
44
@dataclasses.dataclass
class BenchArgs:
    """CLI-configurable settings for the one-batch server benchmark.

    The tuple-valued fields (batch_size, input_len, output_len) are swept with
    itertools.product in run_benchmark, so each combination becomes one case.
    """

    run_name: str = "default"
    # Sweep axes: one benchmark case is run per (batch_size, input_len, output_len) combo.
    batch_size: Tuple[int] = (1,)
    input_len: Tuple[int] = (1024,)
    output_len: Tuple[int] = (16,)
    temperature: float = 0.0
    return_logprob: bool = False
    client_stream_interval: int = 1
    input_len_step_percentage: float = 0.0
    # When non-empty, an already-running server at this URL is used instead of
    # launching a new one (see run_benchmark).
    base_url: str = ""
    skip_warmup: bool = False
    show_report: bool = False
    profile: bool = False
    profile_steps: int = 5
    profile_by_stage: bool = False
    profile_prefix: Optional[str] = None
    profile_output_dir: Optional[str] = None
    dataset_path: str = ""
    dataset_name: str = "random"
    parallel_batch: bool = False
    result_filename: str = "result.jsonl"
    pydantic_result_filename: Optional[str] = None
    append_to_github_summary: bool = True
    seed: int = 42

    @staticmethod
    def add_cli_args(parser: argparse.ArgumentParser):
        """Register all benchmark flags on *parser* (defaults taken from the dataclass)."""
        parser.add_argument("--run-name", type=str, default=BenchArgs.run_name)
        parser.add_argument(
            "--batch-size", type=int, nargs="+", default=BenchArgs.batch_size
        )
        parser.add_argument(
            "--input-len", type=int, nargs="+", default=BenchArgs.input_len
        )
        parser.add_argument(
            "--output-len", type=int, nargs="+", default=BenchArgs.output_len
        )
        parser.add_argument("--temperature", type=float, default=BenchArgs.temperature)
        parser.add_argument("--return-logprob", action="store_true")
        parser.add_argument(
            "--client-stream-interval",
            type=int,
            default=BenchArgs.client_stream_interval,
        )
        parser.add_argument(
            "--input-len-step-percentage",
            type=float,
            default=BenchArgs.input_len_step_percentage,
        )
        parser.add_argument("--base-url", type=str, default=BenchArgs.base_url)
        parser.add_argument("--skip-warmup", action="store_true")
        parser.add_argument("--show-report", action="store_true")
        parser.add_argument("--profile", action="store_true")
        parser.add_argument(
            "--profile-steps", type=int, default=BenchArgs.profile_steps
        )
        parser.add_argument("--profile-by-stage", action="store_true")
        parser.add_argument(
            "--profile-prefix",
            type=str,
            default=BenchArgs.profile_prefix,
        )
        parser.add_argument(
            "--profile-output-dir",
            type=str,
            default=BenchArgs.profile_output_dir,
        )
        parser.add_argument(
            "--dataset-path",
            type=str,
            default=BenchArgs.dataset_path,
            help="Path to the dataset.",
        )
        parser.add_argument(
            "--dataset-name",
            type=str,
            default=BenchArgs.dataset_name,
            choices=["mmmu", "random"],
            help="Name of the dataset to benchmark on.",
        )
        parser.add_argument("--parallel-batch", action="store_true")
        parser.add_argument(
            "--result-filename",
            type=str,
            default=BenchArgs.result_filename,
            help="Store the results line by line in the JSON Line format to this file.",
        )
        parser.add_argument(
            "--pydantic-result-filename",
            type=str,
            default=BenchArgs.pydantic_result_filename,
            help="Store the results as pydantic models in the JSON format to this file.",
        )
        # store_false + dest: the flag's presence turns the summary OFF.
        parser.add_argument(
            "--no-append-to-github-summary",
            action="store_false",
            dest="append_to_github_summary",
            help="Disable appending the output of this run to github ci summary",
        )
        parser.add_argument("--seed", type=int, default=BenchArgs.seed)

    @classmethod
    def from_cli_args(cls, args: argparse.Namespace):
        """Build a BenchArgs from a parsed namespace by copying the matching fields."""
        attrs = [attr.name for attr in dataclasses.fields(cls)]
        return cls(**{attr: getattr(args, attr) for attr in attrs})
150
+
151
+
152
class BenchOneCaseResult(BaseModel):
    """Metrics collected for one (batch_size, input_len, output_len) benchmark case."""

    run_name: str
    batch_size: int
    input_len: int
    output_len: int
    latency: float
    input_throughput: float
    output_throughput: float
    overall_throughput: float
    last_ttft: float
    last_gen_throughput: float
    acc_length: float
    profile_link: Optional[str] = None

    def dump_to_jsonl(self, result_filename: str):
        """Append this result as a single JSON line (profile_link is not persisted)."""
        # Decimal places retained for each numeric field when serializing.
        precision = {
            "latency": 4,
            "input_throughput": 2,
            "output_throughput": 2,
            "overall_throughput": 2,
            "last_ttft": 4,
            "last_gen_throughput": 2,
            "acc_length": 2,
        }
        record = {
            "run_name": self.run_name,
            "batch_size": self.batch_size,
            "input_len": self.input_len,
            "output_len": self.output_len,
        }
        for field_name, digits in precision.items():
            record[field_name] = round(getattr(self, field_name), digits)
        with open(result_filename, "a") as fout:
            fout.write(json.dumps(record) + "\n")
182
+
183
+
184
def launch_server_internal(server_args):
    """Run the SGLang HTTP server in this process, cleaning up children on exit.

    The try/finally guarantees that any subprocesses spawned by the server are
    killed whether launch_server returns normally or raises. The previous
    `except Exception as e: raise e` clause was a no-op re-raise and has been
    removed; exceptions still propagate to the caller unchanged.
    """
    try:
        launch_server(server_args)
    finally:
        # Kill all child processes but keep this process alive so the parent
        # multiprocessing.Process can observe the exit status.
        kill_process_tree(os.getpid(), include_parent=False)
191
+
192
+
193
def launch_server_process(server_args: ServerArgs):
    """Launch the server in a subprocess and poll until it answers /v1/models.

    Returns:
        (proc, base_url): the server process handle and its base URL.

    Raises:
        RuntimeError: if the server process dies before becoming healthy.
        TimeoutError: if the server does not come up within 600 seconds.
    """
    proc = multiprocessing.Process(target=launch_server_internal, args=(server_args,))
    proc.start()
    base_url = f"http://{server_args.host}:{server_args.port}"
    timeout = 600

    start_time = time.time()
    while time.time() - start_time < timeout:
        # Fail fast instead of polling a server process that already exited.
        if not proc.is_alive():
            raise RuntimeError("Server process terminated before becoming healthy.")
        try:
            headers = {
                "Content-Type": "application/json; charset=utf-8",
            }
            # Bound each health-check request so a hung connection cannot
            # stall this loop indefinitely (the unbounded GET previously
            # could block past the overall timeout).
            response = requests.get(
                f"{base_url}/v1/models", headers=headers, timeout=10
            )
            if response.status_code == 200:
                return proc, base_url
        except requests.RequestException:
            pass
        time.sleep(10)
    raise TimeoutError("Server failed to start within the timeout period.")
212
+
213
+
214
def run_one_case(
    url: str,
    batch_size: int,
    input_len: int,
    output_len: int,
    temperature: float,
    return_logprob: bool,
    stream_interval: int,
    input_len_step_percentage: float,
    run_name: str,
    result_filename: str,
    tokenizer: PreTrainedTokenizer | AutoProcessor,
    profile: bool = False,
    profile_steps: int = BenchArgs.profile_steps,
    profile_by_stage: bool = False,
    profile_prefix: Optional[str] = BenchArgs.profile_prefix,
    profile_output_dir: Optional[str] = BenchArgs.profile_output_dir,
    dataset_name: str = BenchArgs.dataset_name,
    dataset_path: str = BenchArgs.dataset_path,
    parallel_batch: bool = False,
) -> "BenchOneCaseResult":
    """Run a single benchmark case against a running server and return its metrics.

    Flushes the server's KV cache, samples `batch_size` requests from the chosen
    dataset, sends them as one streaming /generate call, and measures TTFT,
    latency, and derived throughputs. Optionally triggers a server-side profile
    first. Results are appended to `result_filename` when it is non-empty.
    """
    # Start from a cold cache so prefill cost is measured, not cache hits.
    requests.post(url + "/flush_cache")

    # Load input token ids
    # TODO: reuse bench_serving.get_dataset ?
    if dataset_name == "mmmu":
        input_requests = sample_mmmu_requests(
            num_requests=batch_size,
            processor=tokenizer,
            fixed_output_len=output_len,
            random_sample=False,
        )
    elif dataset_name == "random":
        input_requests = sample_random_requests(
            input_len=input_len,
            output_len=output_len,
            num_prompts=batch_size,
            range_ratio=1.0,
            tokenizer=tokenizer,
            dataset_path=dataset_path,
            random_sample=True,
            return_text=False,
        )

    # Load sampling parameters
    # NOTE: structured-output mode is a hard-coded experiment toggle; the
    # branch below is currently dead code (use_structured_outputs is False).
    use_structured_outputs = False
    if use_structured_outputs:
        texts = []
        for _ in range(batch_size):
            texts.append(
                "Human: What is the capital city of france? can you give as many trivial information as possible about that city? answer in json.\n"
                * 50
                + "Assistant:"
            )
        json_schema = "$$ANY$$"
    else:
        json_schema = None

    payload = {
        "sampling_params": {
            "temperature": temperature,
            "max_new_tokens": output_len,
            # ignore_eos forces exactly output_len tokens per request, which the
            # throughput math below relies on.
            "ignore_eos": True,
            "json_schema": json_schema,
            "stream_interval": stream_interval,
        },
        "return_logprob": return_logprob,
        "stream": True,
        **({"parallel_batch": parallel_batch} if parallel_batch else {}),
    }
    if dataset_name == "mmmu":
        # vlm
        input_ids = []
        # for vlms, tokenizer is an instance of AutoProcessor
        tokenizer = tokenizer.tokenizer
        for input_req in input_requests:
            input_ids += [tokenizer.encode(input_req.prompt)]
        payload["image_data"] = [req.image_data for req in input_requests]

    else:
        # For the random dataset, req.prompt already holds token ids
        # (return_text=False above) — presumably; verify against bench_serving.
        input_ids = [req.prompt for req in input_requests]

    payload["input_ids"] = input_ids

    # Turn on profiler
    profile_link = None
    if profile:
        profile_link: str = run_profile(
            url=url,
            num_steps=profile_steps,
            activities=["CPU", "GPU"],
            output_dir=profile_output_dir,
            profile_by_stage=profile_by_stage,
            profile_prefix=profile_prefix,
        )

    # Run the request
    tic = time.perf_counter()
    response = requests.post(
        url + "/generate",
        json=payload,
        stream=True,
    )

    # Get the TTFT of the last request in the batch
    last_ttft = 0.0
    for chunk in response.iter_lines(decode_unicode=False):
        chunk = chunk.decode("utf-8")
        if chunk and chunk.startswith("data:"):
            if chunk == "data: [DONE]":
                break
            data = json.loads(chunk[5:].strip("\n"))
            if "error" in data:
                raise RuntimeError(f"Request has failed. {data}.")

            # With ignore_eos, the only legitimate finish reason is "length".
            assert (
                data["meta_info"]["finish_reason"] is None
                or data["meta_info"]["finish_reason"]["type"] == "length"
            )
            # completion_tokens == 1 marks a request's first generated token;
            # the last such event observed is the batch's worst-case TTFT.
            if data["meta_info"]["completion_tokens"] == 1:
                last_ttft = time.perf_counter() - tic

    # Compute metrics
    # NOTE(review): if no completion_tokens==1 chunk was seen (e.g.
    # stream_interval > 1), last_ttft stays 0.0 and the division below raises
    # ZeroDivisionError — confirm the server always streams the first token.
    latency = time.perf_counter() - tic
    input_throughput = batch_size * input_len / last_ttft
    output_throughput = batch_size * output_len / (latency - last_ttft)
    overall_throughput = batch_size * (input_len + output_len) / latency

    # -1 is the sentinel for "not reported by the server".
    server_info = requests.get(url + "/get_server_info").json()
    internal_state = server_info.get("internal_states", [{}])
    last_gen_throughput = internal_state[0].get("last_gen_throughput", None) or -1
    acc_length = internal_state[0].get("avg_spec_accept_length", None) or -1

    # Print results
    print(f"batch size: {batch_size}")
    print(f"input_len: {input_len}")
    print(f"output_len: {output_len}")
    print(f"latency: {latency:.2f} s")
    print(f"input throughput: {input_throughput:.2f} tok/s")
    if output_len != 1:
        print(f"output throughput: {output_throughput:.2f} tok/s")
    print(f"last_ttft: {last_ttft:.2f} s")
    print(f"last generation throughput: {last_gen_throughput:.2f} tok/s")
    if acc_length > 0:
        print(f"acc_length: {acc_length:.2f} ")

    # Dump results
    result = BenchOneCaseResult(
        run_name=run_name,
        batch_size=batch_size,
        input_len=input_len,
        output_len=output_len,
        latency=latency,
        input_throughput=input_throughput,
        output_throughput=output_throughput,
        overall_throughput=overall_throughput,
        last_ttft=last_ttft,
        last_gen_throughput=last_gen_throughput,
        acc_length=acc_length,
        profile_link=profile_link,
    )

    # Save and return the results
    if result_filename:
        result.dump_to_jsonl(result_filename)

    return result
381
+
382
+
383
def should_skip_due_to_token_capacity(
    batch_size, input_len, output_len, skip_token_capacity_threshold
):
    """Return True (after logging a banner) when a case exceeds the KV-cache token capacity."""
    total_tokens = batch_size * (input_len + output_len)
    if total_tokens <= skip_token_capacity_threshold:
        return False
    banner = "=" * 8
    print(
        banner
        + f"Skip benchmark {batch_size=} * ({input_len=} + {output_len=}) = {batch_size * (input_len + output_len)} > {skip_token_capacity_threshold=} due to kv cache limit."
        + banner
    )
    return True
394
+
395
+
396
def get_report_summary(
    results: List[BenchOneCaseResult], bench_args: BenchArgs, server_args: ServerArgs
) -> str:
    """Render the benchmark results as a markdown table for reports/CI summaries.

    Sorts *results* by input_len in place. Cost columns use a flat hourly GPU
    price ($4/h on Blackwell, $2/h otherwise) multiplied by the TP size.
    """
    summary = (
        f"\nInput lens: {bench_args.input_len}. Output lens: {bench_args.output_len}.\n"
    )
    summary += "| batch size | input len | latency (s) | input throughput (tok/s) | output throughput (tok/s) | acc length | ITL (ms) | input cost ($/1M) | output cost ($/1M) |"

    if bench_args.profile:
        summary += " profile |"

    summary += "\n"
    summary += "| ---------- | --------- | ----------- | ------------------------- | ------------------------- | ---------- | -------- | ----------------- | ------------------ |"

    if bench_args.profile:
        summary += "-------------|"
    summary += "\n"

    if is_blackwell():
        hourly_cost_per_gpu = 4  # $4/hour for one B200
    else:
        hourly_cost_per_gpu = 2  # $2/hour for one H100
    # Assumed prefill utilization factor applied to the input-cost estimate.
    input_util = 0.7

    # sort result by input_len
    results.sort(key=lambda x: x.input_len)
    for res in results:
        hourly_cost = hourly_cost_per_gpu * server_args.tp_size
        # acc_length uses -1 as a "not available" sentinel (see run_one_case).
        accept_length = round(res.acc_length, 2) if res.acc_length > 0 else "n/a"
        # NOTE(review): the divisions below raise ZeroDivisionError if a
        # throughput is 0 — assumed impossible for a completed case; confirm.
        line = (
            f"| {res.batch_size} | "
            f"{res.input_len} | "
            f"{res.latency:.2f} | "
            f"{res.input_throughput:.2f} | "
            f"{res.output_throughput:.2f} | "
            f"{accept_length} | "
            f"{1 / (res.output_throughput/res.batch_size) * 1000:.2f} | "
            f"{1e6 / (res.input_throughput * input_util) / 3600 * hourly_cost:.2f} | "
            f"{1e6 / res.output_throughput / 3600 * hourly_cost:.2f} |"
        )
        if bench_args.profile:
            if res.profile_link:
                line += f" [Profile]({res.profile_link}) |"
            else:
                line += f" n/a |"
        line += "\n"
        summary += line

    return summary
445
+
446
+
447
def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs):
    """Drive the full benchmark: (optionally) launch a server, warm up, sweep all
    (batch_size, input_len, output_len) combinations, optionally profile, and
    report/persist the results.
    """
    # Reuse an externally managed server when --base-url is given; otherwise
    # launch our own (proc is None in the reuse case, so we never kill it).
    if bench_args.base_url:
        proc, base_url = None, bench_args.base_url
    else:
        proc, base_url = launch_server_process(server_args)

    # Get tokenizer
    # NOTE(review): if neither "tokenizer_path" nor "prefill" is present in
    # the server info, tokenizer_path is unbound below — confirm the server
    # always reports one of the two.
    server_info = requests.get(base_url + "/get_server_info").json()
    if "tokenizer_path" in server_info:
        tokenizer_path = server_info["tokenizer_path"]
    elif "prefill" in server_info:
        # Disaggregated deployment: take the tokenizer from the first prefill worker.
        tokenizer_path = server_info["prefill"][0]["tokenizer_path"]
    if bench_args.dataset_name == "mmmu":
        # mmmu implies this is a MLLM
        tokenizer = get_processor(tokenizer_path)
    else:
        tokenizer = get_tokenizer(tokenizer_path)

    # Get token capacity (falls back to a huge value so nothing is skipped
    # when the server does not report memory usage).
    internal_state = server_info.get("internal_states", [{}])
    skip_token_capacity_threshold = (
        internal_state[0].get("memory_usage", {}).get("token_capacity", 1000000000)
    )

    # Warmup: one small run per batch size so compilation/caches are hot
    # before timed measurements. Results are discarded (empty run_name/file).
    if not bench_args.skip_warmup:
        print("=" * 8 + " Warmup Begin " + "=" * 8)
        print(f"Warmup with batch_size={bench_args.batch_size}")
        for bs in bench_args.batch_size:
            run_one_case(
                base_url,
                batch_size=bs,
                input_len=1024,
                output_len=16,
                temperature=bench_args.temperature,
                return_logprob=bench_args.return_logprob,
                stream_interval=bench_args.client_stream_interval,
                input_len_step_percentage=bench_args.input_len_step_percentage,
                run_name="",
                result_filename="",
                tokenizer=tokenizer,
                dataset_name=bench_args.dataset_name,
                dataset_path=bench_args.dataset_path,
                parallel_batch=bench_args.parallel_batch,
            )
        print("=" * 8 + " Warmup End " + "=" * 8 + "\n")

    results = []
    profile_results = []
    try:
        # Benchmark all cases
        for bs, il, ol in itertools.product(
            bench_args.batch_size, bench_args.input_len, bench_args.output_len
        ):
            if should_skip_due_to_token_capacity(
                bs, il, ol, skip_token_capacity_threshold
            ):
                continue
            results.append(
                run_one_case(
                    base_url,
                    bs,
                    il,
                    ol,
                    temperature=bench_args.temperature,
                    return_logprob=bench_args.return_logprob,
                    stream_interval=bench_args.client_stream_interval,
                    input_len_step_percentage=bench_args.input_len_step_percentage,
                    run_name=bench_args.run_name,
                    result_filename=bench_args.result_filename,
                    tokenizer=tokenizer,
                    dataset_name=bench_args.dataset_name,
                    dataset_path=bench_args.dataset_path,
                    parallel_batch=bench_args.parallel_batch,
                )
            )

        # Profile all cases: re-run the same sweep with profiling enabled, then
        # attach the produced trace links to the un-profiled timing results.
        # Profiling failures are non-fatal (timings above are already collected).
        if bench_args.profile:
            try:
                for bs, il, ol in itertools.product(
                    bench_args.batch_size, bench_args.input_len, bench_args.output_len
                ):
                    if should_skip_due_to_token_capacity(
                        bs, il, ol, skip_token_capacity_threshold
                    ):
                        continue
                    profile_prefix = (
                        bench_args.profile_prefix or ""
                    ) + f"bs-{bs}-il-{il}"
                    profile_results.append(
                        run_one_case(
                            base_url,
                            bs,
                            il,
                            ol,
                            temperature=bench_args.temperature,
                            return_logprob=bench_args.return_logprob,
                            stream_interval=bench_args.client_stream_interval,
                            input_len_step_percentage=bench_args.input_len_step_percentage,
                            run_name=bench_args.run_name,
                            result_filename=bench_args.result_filename,
                            tokenizer=tokenizer,
                            dataset_name=bench_args.dataset_name,
                            dataset_path=bench_args.dataset_path,
                            parallel_batch=bench_args.parallel_batch,
                            profile=bench_args.profile,
                            profile_steps=bench_args.profile_steps,
                            profile_by_stage=bench_args.profile_by_stage,
                            profile_prefix=profile_prefix,
                            profile_output_dir=bench_args.profile_output_dir,
                        )
                    )

                # Replace the profile link (zip pairs runs in sweep order; both
                # loops skip the same cases, so the pairing lines up).
                for res, profile_res in zip(results, profile_results):
                    res.profile_link = profile_res.profile_link
            except Exception as e:
                print(f"Error profiling, there will be no profile trace dump: {e}")
    finally:
        # Only kill the server if this function launched it.
        if proc:
            kill_process_tree(proc.pid)

    print(f"\nResults are saved to {bench_args.result_filename}")

    if not bench_args.show_report:
        return

    # Print summary
    # NOTE(review): the summary is printed here and again in the else branch
    # below, so non-CI runs print it twice — likely unintended; confirm.
    summary = get_report_summary(results, bench_args, server_args)
    print(summary)

    if is_in_ci() and bench_args.append_to_github_summary:
        write_github_step_summary(summary)
    else:
        print(summary)

    # Save results as pydantic models in the JSON format
    if bench_args.pydantic_result_filename:
        save_results_as_pydantic_models(
            results,
            pydantic_result_filename=bench_args.pydantic_result_filename,
            model_path=server_args.model_path,
        )
591
+
592
+
593
if __name__ == "__main__":
    # Combine the server's CLI flags with the benchmark's own flags on one parser.
    parser = argparse.ArgumentParser()
    ServerArgs.add_cli_args(parser)
    BenchArgs.add_cli_args(parser)
    args = parser.parse_args()

    # Seed both RNGs so dataset sampling is reproducible across runs.
    random.seed(args.seed)
    np.random.seed(args.seed)

    server_args = ServerArgs.from_cli_args(args)
    bench_args = BenchArgs.from_cli_args(args)

    run_benchmark(server_args, bench_args)
sglang/bench_serving.py ADDED
The diff for this file is too large to render. See raw diff
 
sglang/check_env.py ADDED
@@ -0,0 +1,433 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Check environment configurations and dependency versions."""
2
+
3
+ import importlib.metadata
4
+ import os
5
+ import resource
6
+ import subprocess
7
+ import sys
8
+ from abc import abstractmethod
9
+ from collections import OrderedDict, defaultdict
10
+
11
+ import torch
12
+
13
+ from sglang.srt.utils import is_hip, is_npu
14
+
15
+
16
def is_cuda_v2():
    """Return True when this torch build was compiled against CUDA."""
    cuda_version = torch.version.cuda
    return cuda_version is not None
18
+
19
+
20
# List of packages to check versions.
# Fix: "torchao" was listed twice, which made check_env print its version
# twice; the duplicate entry has been removed.
PACKAGE_LIST = [
    "sglang",
    "sgl_kernel",
    "flashinfer_python",
    "flashinfer_cubin",
    "flashinfer_jit_cache",
    "triton",
    "transformers",
    "torchao",
    "numpy",
    "aiohttp",
    "fastapi",
    "hf_transfer",
    "huggingface_hub",
    "interegular",
    "modelscope",
    "orjson",
    "outlines",
    "packaging",
    "psutil",
    "pydantic",
    "python-multipart",
    "pyzmq",
    "uvicorn",
    "uvloop",
    "vllm",
    "xgrammar",
    "openai",
    "tiktoken",
    "anthropic",
    "litellm",
    "decord2",
]
55
+
56
+
57
class BaseEnv:
    """Base class for environment check.

    Subclasses implement get_info()/get_topology() for a specific accelerator
    backend; the shared helpers here gather package versions, device names,
    hypervisor info, and ulimits, and check_env() prints everything.
    """

    def __init__(self):
        self.package_list = PACKAGE_LIST

    @abstractmethod
    def get_info(self) -> dict:
        """
        Get CUDA-related information if available.
        """
        raise NotImplementedError

    @abstractmethod
    def get_topology(self) -> dict:
        """Return device-interconnect topology info for this backend."""
        raise NotImplementedError

    def get_package_versions(self) -> dict:
        """
        Get versions of specified packages.
        """
        versions = {}
        for package in self.package_list:
            # Strip any version specifier so e.g. "foo>=1.0" looks up "foo".
            package_name = package.split("==")[0].split(">=")[0].split("<=")[0]
            try:
                version = importlib.metadata.version(package_name)
                versions[package_name] = version
            # PackageNotFoundError is the exact error importlib.metadata
            # raises for a missing distribution (it subclasses
            # ModuleNotFoundError, which was caught before).
            except importlib.metadata.PackageNotFoundError:
                versions[package_name] = "Module Not Found"
        return versions

    def get_device_info(self):
        """
        Get information about available GPU devices.
        """
        devices = defaultdict(list)
        capabilities = defaultdict(list)
        for k in range(torch.cuda.device_count()):
            devices[torch.cuda.get_device_name(k)].append(str(k))
            capability = torch.cuda.get_device_capability(k)
            capabilities[f"{capability[0]}.{capability[1]}"].append(str(k))

        gpu_info = {}
        for name, device_ids in devices.items():
            gpu_info[f"GPU {','.join(device_ids)}"] = name

        if len(capabilities) == 1:
            # All GPUs have the same compute capability
            cap, gpu_ids = list(capabilities.items())[0]
            gpu_info[f"GPU {','.join(gpu_ids)} Compute Capability"] = cap
        else:
            # GPUs have different compute capabilities
            for cap, gpu_ids in capabilities.items():
                gpu_info[f"GPU {','.join(gpu_ids)} Compute Capability"] = cap

        return gpu_info

    def get_hypervisor_vendor(self) -> dict:
        """Best-effort: parse the hypervisor vendor from `lscpu` output."""
        try:
            output = subprocess.check_output(["lscpu"], text=True)
            for line in output.split("\n"):
                if "Hypervisor vendor:" in line:
                    return {"Hypervisor vendor:": line.split(":")[1].strip()}
            return {}
        # Was a bare `except:`, which also swallowed SystemExit and
        # KeyboardInterrupt; Exception keeps the best-effort behavior while
        # letting those propagate.
        except Exception:
            return {}

    def get_ulimit_soft(self) -> dict:
        """Return the soft limit on open file descriptors (RLIMIT_NOFILE)."""
        ulimit_soft, _ = resource.getrlimit(resource.RLIMIT_NOFILE)
        return {"ulimit soft": ulimit_soft}

    def check_env(self):
        """
        Check and print environment information.
        """
        env_info = OrderedDict()
        env_info["Python"] = sys.version.replace("\n", "")
        env_info.update(self.get_info())
        env_info["PyTorch"] = torch.__version__
        env_info.update(self.get_package_versions())
        env_info.update(self.get_topology())
        env_info.update(self.get_hypervisor_vendor())
        env_info.update(self.get_ulimit_soft())

        for k, v in env_info.items():
            print(f"{k}: {v}")
143
+
144
+
145
class GPUEnv(BaseEnv):
    """Environment checker for Nvidia GPU"""

    def get_info(self):
        # Report CUDA availability first; only probe devices/toolkit when present.
        cuda_info = {"CUDA available": torch.cuda.is_available()}

        if cuda_info["CUDA available"]:
            cuda_info.update(self.get_device_info())
            cuda_info.update(self._get_cuda_version_info())

        return cuda_info

    def _get_cuda_version_info(self):
        """
        Get CUDA version information.
        """
        from torch.utils.cpp_extension import CUDA_HOME

        cuda_info = {"CUDA_HOME": CUDA_HOME}

        # Only query nvcc/driver when the toolkit directory actually exists.
        if CUDA_HOME and os.path.isdir(CUDA_HOME):
            cuda_info.update(self._get_nvcc_info())
            cuda_info.update(self._get_cuda_driver_version())

        return cuda_info

    def _get_nvcc_info(self):
        """
        Get NVCC version information.
        """
        from torch.utils.cpp_extension import CUDA_HOME

        try:
            nvcc = os.path.join(CUDA_HOME, "bin/nvcc")
            nvcc_output = (
                subprocess.check_output(f'"{nvcc}" -V', shell=True)
                .decode("utf-8")
                .strip()
            )
            # Extract the segment between "Cuda compilation tools" and "Build"
            # from `nvcc -V` output, i.e. the human-readable version line.
            return {
                "NVCC": nvcc_output[
                    nvcc_output.rfind("Cuda compilation tools") : nvcc_output.rfind(
                        "Build"
                    )
                ].strip()
            }
        except subprocess.SubprocessError:
            return {"NVCC": "Not Available"}

    def _get_cuda_driver_version(self):
        """
        Get CUDA driver version.
        """
        versions = set()
        try:
            output = subprocess.check_output(
                [
                    "nvidia-smi",
                    "--query-gpu=driver_version",
                    "--format=csv,noheader,nounits",
                ]
            )
            # One line per GPU; a set collapses identical driver versions.
            versions = set(output.decode().strip().split("\n"))
            if len(versions) == 1:
                return {"CUDA Driver Version": versions.pop()}
            else:
                # Mixed-driver machines: report all distinct versions.
                return {"CUDA Driver Versions": ", ".join(sorted(versions))}
        except subprocess.SubprocessError:
            return {"CUDA Driver Version": "Not Available"}

    def get_topology(self):
        """
        Get GPU topology information.
        """
        try:
            result = subprocess.run(
                ["nvidia-smi", "topo", "-m"],
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                text=True,
                check=True,
            )
            # check=True means returncode is always 0 here; the conditional is
            # defensive and effectively always takes the stdout branch.
            return {
                "NVIDIA Topology": (
                    "\n" + result.stdout if result.returncode == 0 else None
                )
            }
        except subprocess.SubprocessError:
            return {}
234
+
235
+
236
class HIPEnv(BaseEnv):
    """Environment checker for ROCm/HIP"""

    def get_info(self):
        # On ROCm builds, torch.cuda.is_available() reports HIP availability.
        cuda_info = {"ROCM available": torch.cuda.is_available()}

        if cuda_info["ROCM available"]:
            cuda_info.update(self.get_device_info())
            cuda_info.update(self._get_cuda_version_info())

        return cuda_info

    def _get_cuda_version_info(self):
        """Collect ROCm toolkit location, hipcc, and driver version info."""
        from torch.utils.cpp_extension import ROCM_HOME as ROCM_HOME

        cuda_info = {"ROCM_HOME": ROCM_HOME}

        if ROCM_HOME and os.path.isdir(ROCM_HOME):
            cuda_info.update(self._get_hipcc_info())
            cuda_info.update(self._get_rocm_driver_version())

        return cuda_info

    def _get_hipcc_info(self):
        """Parse the HIP version line from `hipcc --version` output."""
        from torch.utils.cpp_extension import ROCM_HOME

        try:
            hipcc = os.path.join(ROCM_HOME, "bin/hipcc")
            hipcc_output = (
                subprocess.check_output(f'"{hipcc}" --version', shell=True)
                .decode("utf-8")
                .strip()
            )
            # Extract the segment between "HIP version" and "AMD clang".
            return {
                "HIPCC": hipcc_output[
                    hipcc_output.rfind("HIP version") : hipcc_output.rfind("AMD clang")
                ].strip()
            }
        except subprocess.SubprocessError:
            return {"HIPCC": "Not Available"}

    def _get_rocm_driver_version(self):
        """Parse the driver version from `rocm-smi --showdriverversion --csv`."""
        try:
            output = subprocess.check_output(
                [
                    "rocm-smi",
                    "--showdriverversion",
                    "--csv",
                ]
            )
            versions = set(output.decode().strip().split("\n"))
            versions.discard("name, value")  # drop the CSV header row
            # Fix: if the output contained only the header (or was otherwise
            # unexpected), the set is empty and .pop() would raise a KeyError
            # that the SubprocessError handler below does not catch.
            if not versions:
                return {"ROCM Driver Version": "Not Available"}
            ver = versions.pop()
            ver = ver.replace('"Driver version", ', "").replace('"', "")

            return {"ROCM Driver Version": ver}
        except subprocess.SubprocessError:
            return {"ROCM Driver Version": "Not Available"}

    def get_topology(self):
        """Return the AMD GPU interconnect topology from `rocm-smi --showtopotype`."""
        try:
            result = subprocess.run(
                ["rocm-smi", "--showtopotype"],
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                text=True,
                check=True,
            )
            return {
                "AMD Topology": "\n" + result.stdout if result.returncode == 0 else None
            }
        except subprocess.SubprocessError:
            return {}
309
+
310
+
311
class NPUEnv(BaseEnv):
    """Environment checker for Ascend NPU"""

    # NPU-specific packages reported in addition to the shared PACKAGE_LIST.
    EXTRA_PACKAGE_LIST = [
        "torch_npu",
        "sgl-kernel-npu",
        "deep_ep",
    ]

    def __init__(self):
        super().__init__()
        self.package_list.extend(NPUEnv.EXTRA_PACKAGE_LIST)

    def get_info(self):
        cuda_info = {"NPU available": torch.npu.is_available()}
        if cuda_info["NPU available"]:
            cuda_info.update(self.get_device_info())
            cuda_info.update(self._get_cann_version_info())

        return cuda_info

    def get_device_info(self):
        """
        Get information about available NPUs.
        Need to override due to torch_npu interface differences.
        """
        devices = defaultdict(list)
        for k in range(torch.npu.device_count()):
            devices[torch.npu.get_device_name(k)].append(str(k))

        npu_info = {}
        for name, device_ids in devices.items():
            npu_info[f"NPU {','.join(device_ids)}"] = name

        return npu_info

    def _get_cann_version_info(self):
        """Locate the CANN toolkit (env vars first, then the default install path)
        and collect its version, compiler, and driver information."""
        cann_envs = ["ASCEND_TOOLKIT_HOME", "ASCEND_INSTALL_PATH"]
        for var in cann_envs:
            path = os.environ.get(var)
            if path and os.path.exists(path):
                CANN_HOME = path
                break
        else:
            # for-else: runs only when no env var pointed at an existing path.
            default_path = "/usr/local/Ascend/ascend-toolkit/latest"
            CANN_HOME = default_path if os.path.exists(default_path) else None

        if CANN_HOME:
            npu_info = {"CANN_HOME": CANN_HOME}
            npu_info.update(self._get_cann_info(CANN_HOME))
            npu_info.update(self._get_ascend_driver_version())
            return npu_info
        else:
            return {"CANN_HOME": "Not found"}

    def _get_cann_info(self, CANN_HOME: str):
        """Read the CANN version from version.cfg and query the BiSheng compiler."""
        cann_info = {}
        cann_version_file = os.path.join(CANN_HOME, "version.cfg")
        if os.path.exists(cann_version_file):
            with open(cann_version_file, "r", encoding="utf-8") as f:
                f.readline()  # discard first line comment in version.cfg
                # NOTE(review): assumes the second line contains "[version]";
                # a malformed file would raise IndexError here — confirm the
                # version.cfg format is stable across CANN releases.
                cann_info["CANN"] = f.readline().split("[")[1].split("]")[0]
        else:
            cann_info["CANN"] = "Not Available"
        try:
            bisheng = os.path.join(CANN_HOME, "compiler/ccec_compiler/bin/bisheng")
            bisheng_output = (
                subprocess.check_output([bisheng, "--version"]).decode("utf-8").strip()
            )
            # First line of `bisheng --version` carries the version string.
            cann_info["BiSheng"] = bisheng_output.split("\n")[0].strip()
        except subprocess.SubprocessError:
            cann_info["BiSheng"] = "Not Available"
        return cann_info

    def _get_ascend_driver_version(self):
        """Parse the driver version from `npu-smi info -t board -i 0`."""
        try:
            output = subprocess.check_output(
                [
                    "npu-smi",
                    "info",
                    "-t",
                    "board",
                    "-i",
                    "0",
                ]
            )
            for line in output.decode().strip().split("\n"):
                if "Software Version" in line:
                    version = line.split(":")[-1].strip()
                    break
            else:
                # for-else: no "Software Version" line found in the output.
                version = "Not Available"

            return {"Ascend Driver Version": version}
        except subprocess.SubprocessError:
            return {"Ascend Driver Version": "Not Available"}

    def get_topology(self):
        """Return the NPU interconnect topology from `npu-smi info -t topo`."""
        try:
            result = subprocess.run(
                ["npu-smi", "info", "-t", "topo"],
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                text=True,
                check=True,
            )
            # check=True means returncode is 0 here; the conditional is defensive.
            return {
                "Ascend Topology": (
                    "\n" + result.stdout if result.returncode == 0 else None
                )
            }
        except subprocess.SubprocessError:
            return {}
424
+
425
+
426
if __name__ == "__main__":
    # Pick the environment checker matching the detected accelerator backend.
    if is_cuda_v2():
        env = GPUEnv()
    elif is_hip():
        env = HIPEnv()
    elif is_npu():
        env = NPUEnv()
    else:
        # Fix: previously `env` was left unbound when no backend matched,
        # producing a confusing NameError on env.check_env(); fail with an
        # explicit message instead.
        raise RuntimeError(
            "No supported accelerator backend detected (CUDA, ROCm, or NPU)."
        )
    env.check_env()
sglang/cli/__init__.py ADDED
File without changes
sglang/cli/generate.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+
3
+ from sglang.cli.utils import get_is_diffusion_model, get_model_path
4
+
5
+
6
def generate(args, extra_argv):
    """Handle the `sglang generate` subcommand.

    Builds the real argument parser lazily (it lives in the multimodal_gen
    package) so that --help works without requiring --model-path, then
    dispatches to the diffusion generate command when the model is a
    diffusion model.
    """
    # If help is requested, show generate subcommand help without requiring --model-path
    if any(h in extra_argv for h in ("-h", "--help")):
        from sglang.multimodal_gen.runtime.entrypoints.cli.generate import (
            add_multimodal_gen_generate_args,
        )

        parser = argparse.ArgumentParser(description="SGLang Multimodal Generation")
        add_multimodal_gen_generate_args(parser)
        # parse_args prints the help text and exits the process here.
        parser.parse_args(extra_argv)
        return

    model_path = get_model_path(extra_argv)
    is_diffusion_model = get_is_diffusion_model(model_path)
    if is_diffusion_model:
        from sglang.multimodal_gen.runtime.entrypoints.cli.generate import (
            add_multimodal_gen_generate_args,
            generate_cmd,
        )

        parser = argparse.ArgumentParser(description="SGLang Multimodal Generation")
        add_multimodal_gen_generate_args(parser)
        parsed_args = parser.parse_args(extra_argv)
        generate_cmd(parsed_args)
    else:
        # Only diffusion models are supported by `generate` for now.
        raise Exception(
            f"Generate subcommand is not yet supported for model: {model_path}"
        )
sglang/cli/main.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+
3
+ from sglang.cli.generate import generate
4
+ from sglang.cli.serve import serve
5
+
6
+
7
def main():
    """Top-level ``sglang`` CLI dispatcher: routes to ``serve`` or ``generate``."""
    parser = argparse.ArgumentParser()
    subparsers = parser.add_subparsers(dest="subcommand", required=True)

    # add_help=False on each stub: every subcommand defers -h/--help handling
    # to its own, model-dependent parser.
    for name, help_text, handler in (
        ("serve", "Launch the SGLang server.", serve),
        ("generate", "Run inference on a multimodal model.", generate),
    ):
        sub = subparsers.add_parser(name, help=help_text, add_help=False)
        sub.set_defaults(func=handler)

    args, extra_argv = parser.parse_known_args()
    args.func(args, extra_argv)
sglang/cli/serve.py ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-License-Identifier: Apache-2.0
2
+
3
+ import argparse
4
+ import logging
5
+ import os
6
+
7
+ from sglang.cli.utils import get_is_diffusion_model, get_model_path
8
+ from sglang.srt.utils import kill_process_tree
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
+
13
def serve(args, extra_argv):
    """Entry point for ``sglang serve``.

    Dispatches to either the diffusion-model server or the standard language
    model server depending on the model found at ``--model-path``. Child
    processes are always torn down on exit via the ``finally`` clause.
    """
    if any(h in extra_argv for h in ("-h", "--help")):
        # Without a model path we cannot know which server would launch, so
        # print general usage followed by both servers' help texts.
        print(
            "Usage: sglang serve --model-path <model-name-or-path> [additional-arguments]\n"
        )
        print(
            "This command can launch either a standard language model server or a diffusion model server."
        )
        print("The server type is determined by the model path.\n")
        print("For specific arguments, please provide a model_path.")
        print("\n--- Help for Standard Language Model Server ---")
        from sglang.srt.server_args import prepare_server_args

        try:
            prepare_server_args(["--help"])
        except SystemExit:
            pass  # argparse --help calls sys.exit

        print("\n--- Help for Diffusion Model Server ---")
        from sglang.multimodal_gen.runtime.entrypoints.cli.serve import (
            add_multimodal_gen_serve_args,
        )

        parser = argparse.ArgumentParser(description="SGLang Diffusion Model Serving")
        add_multimodal_gen_serve_args(parser)
        parser.print_help()
        return

    model_path = get_model_path(extra_argv)
    try:
        # Fix: the detection result was previously tested twice in a row
        # (once only to log "Diffusion model detected", which
        # get_is_diffusion_model() already logs itself); collapse to one check.
        if get_is_diffusion_model(model_path):
            # Logic for Diffusion Models
            from sglang.multimodal_gen.runtime.entrypoints.cli.serve import (
                add_multimodal_gen_serve_args,
                execute_serve_cmd,
            )

            parser = argparse.ArgumentParser(
                description="SGLang Diffusion Model Serving"
            )
            add_multimodal_gen_serve_args(parser)
            parsed_args, remaining_argv = parser.parse_known_args(extra_argv)

            execute_serve_cmd(parsed_args, remaining_argv)
        else:
            # Logic for Standard Language Models
            from sglang.launch_server import run_server
            from sglang.srt.server_args import prepare_server_args

            # prepare_server_args takes the argv tail directly (no program
            # name), matching how extra_argv is collected by the dispatcher.
            server_args = prepare_server_args(extra_argv)

            run_server(server_args)
    finally:
        # Ensure any worker/child processes are terminated even on error.
        kill_process_tree(os.getpid(), include_parent=False)
sglang/cli/utils.py ADDED
@@ -0,0 +1,152 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import hashlib
2
+ import json
3
+ import logging
4
+ import os
5
+ import tempfile
6
+ from typing import Optional
7
+
8
+ import filelock
9
+ from huggingface_hub import hf_hub_download
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+ temp_dir = tempfile.gettempdir()
14
+
15
+
16
def _get_lock(model_name_or_path: str, cache_dir: Optional[str] = None):
    """Build an inter-process file lock guarding downloads of one model.

    Args:
        model_name_or_path: Model ID or path; used to derive a unique lock name.
        cache_dir: Directory for the lock file; defaults to the system temp dir.

    Returns:
        A ``filelock.FileLock`` scoped to this model.
    """
    lock_dir = cache_dir or temp_dir
    # Bug fix: this previously created os.path.dirname(lock_dir) — the
    # *parent* of the lock directory — leaving lock_dir itself possibly
    # missing and FileLock creation failing for a fresh cache_dir.
    os.makedirs(lock_dir, exist_ok=True)
    model_name = model_name_or_path.replace("/", "-")
    hash_name = hashlib.sha256(model_name.encode()).hexdigest()
    lock_file_name = hash_name + model_name + ".lock"
    lock = filelock.FileLock(os.path.join(lock_dir, lock_file_name), mode=0o666)
    return lock
24
+
25
+
26
+ # Copied and adapted from hf_diffusers_utils.py
27
def _maybe_download_model(
    model_name_or_path: str, local_dir: str | None = None, download: bool = True
) -> str:
    """Resolve a model path to a local directory containing its config.

    An existing local path is returned unchanged. Otherwise the name is
    treated as a Hugging Face Hub ID and only the pipeline/model config file
    (``model_index.json`` first, then ``config.json``) is fetched, under a
    per-model file lock.

    Args:
        model_name_or_path: Local path or Hugging Face Hub model ID.
        local_dir: Local directory to save the downloaded file (if any).
        download: Whether to download from Hugging Face Hub when needed.

    Returns:
        Directory containing the downloaded config file, or the input path.

    Raises:
        ValueError: if neither config file can be downloaded.
    """
    if os.path.exists(model_name_or_path):
        logger.info("Model already exists locally")
        return model_name_or_path

    if not download:
        return model_name_or_path

    with _get_lock(model_name_or_path):
        # Attempt 1: model_index.json (present for diffusers pipelines).
        try:
            logger.info(
                "Downloading model_index.json from HF Hub for %s...",
                model_name_or_path,
            )
            file_path = hf_hub_download(
                repo_id=model_name_or_path,
                filename="model_index.json",
                local_dir=local_dir,
            )
        except Exception as e_index:
            logger.debug("model_index.json not found or failed: %s", e_index)
        else:
            logger.info("Downloaded to %s", file_path)
            return os.path.dirname(file_path)

        # Attempt 2: config.json (standard transformers models).
        try:
            logger.info(
                "Downloading config.json from HF Hub for %s...", model_name_or_path
            )
            file_path = hf_hub_download(
                repo_id=model_name_or_path,
                filename="config.json",
                local_dir=local_dir,
            )
        except Exception as e_config:
            raise ValueError(
                (
                    "Could not find model locally at %s and failed to download "
                    "model_index.json/config.json from HF Hub: %s"
                )
                % (model_name_or_path, e_config)
            ) from e_config
        logger.info("Downloaded to %s", file_path)
        return os.path.dirname(file_path)
88
+
89
+
90
+ # Copied and adapted from hf_diffusers_utils.py
91
def is_diffusers_model_path(model_path: str) -> bool:
    """Check whether a local directory holds a diffusers pipeline.

    A diffusers pipeline is identified by a ``model_index.json`` file that
    carries a ``_diffusers_version`` key.

    Args:
        model_path: Path to the model directory.

    Returns:
        True if the directory is a diffusers model, False otherwise.
        (Fixed: the annotation was ``-> True`` and the docstring described a
        dict/None return, but the function has always returned a bool.)
    """
    # Prefer model_index.json which indicates a diffusers pipeline
    config_path = os.path.join(model_path, "model_index.json")
    if not os.path.exists(config_path):
        return False

    # Load the config
    with open(config_path) as f:
        config = json.load(f)

    # A genuine diffusers export always records the library version.
    return "_diffusers_version" in config
116
+
117
+
118
def get_is_diffusion_model(model_path: str):
    """Resolve ``model_path`` (downloading its config if needed) and report
    whether it points at a diffusion (diffusers) model."""
    resolved_path = _maybe_download_model(model_path)
    result = is_diffusers_model_path(resolved_path)
    if result:
        logger.info("Diffusion model detected")
    return result
124
+
125
+
126
def get_model_path(extra_argv):
    """Extract the value of ``--model-path`` from a raw argv list.

    Supports both ``--model-path <value>`` and ``--model-path=<value>``.

    Raises:
        Exception: when no model path is present; the message is a usage
            summary if help was requested, or a plain error otherwise.
    """
    model_path = None
    for pos, token in enumerate(extra_argv):
        if token.startswith("--model-path="):
            model_path = token.split("=", 1)[1]
            break
        if token == "--model-path" and pos + 1 < len(extra_argv):
            model_path = extra_argv[pos + 1]
            break

    if model_path is not None:
        return model_path

    # No model path found: tailor the error to whether help was requested.
    if "-h" in extra_argv or "--help" in extra_argv:
        raise Exception(
            "Usage: sglang serve --model-path <model-name-or-path> [additional-arguments]\n\n"
            "This command can launch either a standard language model server or a diffusion model server.\n"
            "The server type is determined by the model path.\n"
            "For specific arguments, please provide a model_path."
        )
    raise Exception(
        "Error: --model-path is required. "
        "Please provide the path to the model."
    )
sglang/compile_deep_gemm.py ADDED
@@ -0,0 +1,191 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Compile DeepGEMM Kernels for a model with specify server arguments
3
+
4
+ This script launches a server for capturing DeepGEMM calls and then compiles the kernels.
5
+ It accepts server arguments (the same as launch_server.py).
6
+
7
+ Usage:
8
+ python3 -m sglang.compile_deep_gemm --model deepseek-ai/DeepSeek-V3 --tp 8 --trust-remote-code
9
+
10
+ """
11
+
12
+ import argparse
13
+ import dataclasses
14
+ import multiprocessing
15
+ import os
16
+ import time
17
+
18
+ import requests
19
+
20
+ from sglang.srt.disaggregation.utils import FAKE_BOOTSTRAP_HOST
21
+ from sglang.srt.entrypoints.http_server import launch_server
22
+ from sglang.srt.entrypoints.warmup import warmup
23
+ from sglang.srt.environ import envs
24
+ from sglang.srt.managers.io_struct import GenerateReqInput
25
+ from sglang.srt.managers.tokenizer_manager import TokenizerManager
26
+ from sglang.srt.server_args import ServerArgs
27
+ from sglang.srt.utils import kill_process_tree
28
+
29
# Use spawn so server child processes start from a clean interpreter state.
multiprocessing.set_start_method("spawn", force=True)

# Reduce warning
envs.SGLANG_IN_DEEPGEMM_PRECOMPILE_STAGE.set(True)
# Force enable deep gemm
envs.SGLANG_ENABLE_JIT_DEEPGEMM.set(True)
# Force enable mha chunked kv for DeepSeek V3 to avoid missing kv_b_proj DeepGEMM case
os.environ["SGL_CHUNKED_PREFIX_CACHE_THRESHOLD"] = "0"
37
+
38
+
39
@dataclasses.dataclass
class CompileArgs:
    """CLI arguments specific to the DeepGEMM pre-compilation driver."""

    # Overall time budget (seconds) for server startup plus compilation.
    timeout: int = 3600

    @staticmethod
    def add_cli_args(parser: argparse.ArgumentParser):
        """Register compile-specific flags on an existing parser."""
        parser.add_argument("--timeout", type=int, default=CompileArgs.timeout)

    @classmethod
    def from_cli_args(cls, args: argparse.Namespace):
        """Build a CompileArgs from parsed args, coercing each field's value
        to the type of its declared default."""
        values = {}
        for field in dataclasses.fields(cls):
            caster = type(field.default)
            values[field.name] = caster(getattr(args, field.name))
        return cls(**values)
54
+
55
+
56
+ @warmup("compile-deep-gemm")
57
+ async def warm_up_compile(
58
+ disaggregation_mode: str, tokenizer_manager: TokenizerManager
59
+ ):
60
+ print("\nGenerate warm up request for compiling DeepGEMM...\n")
61
+ generate_req_input = GenerateReqInput(
62
+ input_ids=[0, 1, 2, 3],
63
+ sampling_params={
64
+ "temperature": 0.0,
65
+ "max_new_tokens": 8,
66
+ "ignore_eos": True,
67
+ },
68
+ )
69
+ if disaggregation_mode != "null":
70
+ generate_req_input.bootstrap_room = 0
71
+ generate_req_input.bootstrap_host = FAKE_BOOTSTRAP_HOST
72
+
73
+ await tokenizer_manager.generate_request(generate_req_input, None).__anext__()
74
+
75
+
76
def launch_server_internal(server_args):
    """Run launch_server() and always tear down this process's children.

    Cleanup note: the previous ``except Exception as e: raise e`` was a no-op
    re-raise that only truncated tracebacks, so it was removed; the
    ``finally`` clause is the part that matters.
    """
    try:
        launch_server(server_args)
    finally:
        kill_process_tree(os.getpid(), include_parent=False)
83
+
84
+
85
def launch_server_process_and_send_one_request(
    server_args: ServerArgs, compile_args: CompileArgs
):
    """Start the server in a child process and drive one generate request.

    Rank-0 nodes poll ``/v1/models`` until the server responds, then POST a
    tiny ``/generate`` request (which exercises the compile warmup path).
    Non-rank-0 nodes poll ``/health`` and then wait for the server process to
    exit on its own.

    Returns:
        The child ``multiprocessing.Process`` running the server.

    Raises:
        TimeoutError: if the server does not come up / finish within
            ``compile_args.timeout`` seconds.
        RuntimeError: if the sync generate request returns a non-200 status.
    """
    proc = multiprocessing.Process(target=launch_server_internal, args=(server_args,))
    proc.start()
    base_url = f"http://{server_args.host}:{server_args.port}"
    timeout = compile_args.timeout

    start_time = time.perf_counter()
    while time.perf_counter() - start_time < timeout:
        try:
            headers = {
                "Content-Type": "application/json; charset=utf-8",
            }
            if server_args.node_rank == 0:
                response = requests.get(f"{base_url}/v1/models", headers=headers)
            else:
                # This http api is created by launch_dummy_health_check_server for none-rank0 node.
                response = requests.get(f"{base_url}/health", headers=headers)
            if response.status_code == 200:
                # Rank-0 node send a request to sync with other node and then return.
                if server_args.node_rank == 0:
                    payload = {
                        "input_ids": [0, 1, 2, 3],
                        "sampling_params": {
                            "max_new_tokens": 8,
                            "temperature": 0,
                        },
                    }
                    # In PD mode, include fake bootstrap fields so workers don't assert
                    if server_args.disaggregation_mode != "null":
                        payload["bootstrap_host"] = FAKE_BOOTSTRAP_HOST
                        payload["bootstrap_room"] = 0

                    response = requests.post(
                        f"{base_url}/generate",
                        json=payload,
                        timeout=600,
                    )
                    if response.status_code != 200:
                        error = response.json()
                        raise RuntimeError(f"Sync request failed: {error}")
                # Other nodes should wait for the exit signal from Rank-0 node.
                else:
                    start_time_waiting = time.perf_counter()
                    while proc.is_alive():
                        if time.perf_counter() - start_time_waiting < timeout:
                            time.sleep(10)
                        else:
                            raise TimeoutError("Waiting for main node timeout!")
                return proc
        except requests.RequestException:
            # Server not accepting connections yet; retry after a pause.
            pass
        time.sleep(10)
    raise TimeoutError(
        "DeepGEMM Kernels compilation timeout."
        "\n\nFeel free and please restart the command."
    )
143
+
144
+
145
def refine_server_args(server_args: "ServerArgs", compile_args: "CompileArgs"):
    """Adjust server arguments in place for a compilation-only run.

    Disables CUDA graph capture and torch.compile (slow to set up and not
    needed just to compile kernels), extends the watchdog to the compile
    timeout, and enables the "compile-deep-gemm" warmup hook.

    Args:
        server_args: Server arguments to mutate (annotations are quoted so
            the function is importable without the sglang runtime).
        compile_args: Source of the compilation timeout.
    """
    # Disable cuda graph and torch compile to save time
    server_args.disable_cuda_graph = True
    server_args.enable_torch_compile = False
    # Fixed: this was an f-string with no placeholders.
    print("Disable CUDA Graph and Torch Compile to save time...")

    # Set watchdog timeout to compile_args.timeout because compilation will take a long time
    server_args.watchdog_timeout = compile_args.timeout
    server_args.warmups = "compile-deep-gemm"
155
+
156
def run_compile(server_args: ServerArgs, compile_args: CompileArgs):
    """Launch the server, trigger compilation via one request, then shut down.

    Raises:
        TimeoutError: propagated from the launch helper when compilation does
            not finish within the configured timeout.
    """
    print(
        "Begin DeepGEMM Kernels compilation...\n"
        "It may take a long time and timeout maybe raised "
        "while the compilation is still in progress.\n"
        "Just feel free to restart the command "
        "until the compilation is fully finished.\n"
    )

    proc = launch_server_process_and_send_one_request(server_args, compile_args)

    print("\nDeepGEMM Kernels compilation finished successfully.")

    # Sleep for safety
    time.sleep(10)
    if proc.is_alive():
        # This is the rank0 node.
        kill_process_tree(proc.pid)
    else:
        # Non-rank0 nodes: the process may already be gone, so ignore
        # cleanup failures here.
        try:
            kill_process_tree(proc.pid)
        except Exception:
            pass
180
+
181
+ if __name__ == "__main__":
182
+ parser = argparse.ArgumentParser()
183
+ ServerArgs.add_cli_args(parser)
184
+ CompileArgs.add_cli_args(parser)
185
+ args = parser.parse_args()
186
+ server_args = ServerArgs.from_cli_args(args)
187
+ compile_args = CompileArgs.from_cli_args(args)
188
+
189
+ refine_server_args(server_args, compile_args)
190
+
191
+ run_compile(server_args, compile_args)
sglang/eval/llama3_eval.py ADDED
@@ -0,0 +1,315 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Adapt from https://github.com/fw-ai/llm_eval_meta
2
+
3
+ import argparse
4
+ import asyncio
5
+ import os
6
+ import pickle
7
+ import re
8
+ import shutil
9
+ from collections import defaultdict
10
+ from dataclasses import dataclass
11
+
12
+ import httpx
13
+ import numpy as np
14
+ import openai
15
+ from datasets import load_dataset
16
+ from openai import AsyncOpenAI
17
+ from tqdm import tqdm
18
+
19
# Maps provider key -> model-size key -> HF model ID sent in requests.
# Providers (see get_client): "b10" = Baseten, "oai" = local OpenAI-compatible
# server on :8000, "sgl" = local SGLang server on :30000.
provider_to_models = {
    "b10": {
        "8b": "meta-llama/Llama-3.1-8B-Instruct",
        "70b": "meta-llama/Llama-3.1-70B-Instruct",
        "405b": "meta-llama/Llama-3.1-405B-Instruct",
    },
    "oai": {
        "8b": "meta-llama/Llama-3.1-8B-Instruct",
        "70b": "meta-llama/Llama-3.1-70B-Instruct",
        "405b": "meta-llama/Llama-3.1-405B-Instruct",
    },
    "sgl": {
        "8b": "meta-llama/Llama-3.1-8B-Instruct",
        "70b": "meta-llama/Llama-3.1-70B-Instruct",
        "405b": "meta-llama/Llama-3.1-405B-Instruct",
    },
}
37
+
38
+
39
async def fetch_responses(
    client, prompt, semaphore, index, provider, model_size, output_dir, max_tokens
):
    """Issue one completion request and cache the raw response to disk.

    Skips work when ``response_<index>.pkl`` already exists, which makes an
    interrupted benchmark run resumable.
    """
    output_file = os.path.join(output_dir, f"response_{index}.pkl")
    if os.path.exists(output_file):
        print(f"File {output_file} already exists, skipping.")
        return

    async with semaphore:
        response = await client.completions.create(
            model=provider_to_models[provider][model_size],
            prompt=prompt,
            temperature=0.0,
            max_tokens=max_tokens,
        )
        if isinstance(response, openai.BadRequestError):
            with open(output_file, "wb") as f:
                pickle.dump("bad_response", f)
            # Bug fix: previously fell through to the assert below (which
            # would fail) and then overwrote the "bad_response" sentinel.
            return
        assert isinstance(response, openai.types.completion.Completion)
        # Save response to a file
        with open(output_file, "wb") as f:
            pickle.dump(response, f)
62
+
63
# Max completion tokens per eval set: plain MMLU needs only the single answer
# letter; chain-of-thought variants need room for the reasoning text.
TASK_TO_MAX_TOKENS = {
    "evals__mmlu__details": 1,
    "evals__mmlu__0_shot__cot__details": 1024,
    # Official meta uses 1024, but a small % (.05) of questions are answered correctly after relaxing
    "evals__mmlu_pro__details": 2048,
    "evals__gsm8k__details": 1024,
}

# Maps the short CLI --task name to the dataset config suffix on HF Hub.
TASK_TO_EVAL_SET = {
    "mmlu": "evals__mmlu__details",
    "mmlu_cot": "evals__mmlu__0_shot__cot__details",
    "mmlu_pro": "evals__mmlu_pro__details",
    "gsm8k": "evals__gsm8k__details",
}
77
+
78
+
79
class CustomAsyncHTTPXClient(httpx.AsyncClient):
    """HTTPX client that rewrites every outgoing request URL to the Baseten
    predict endpoint derived from the MODEL_ID environment variable."""

    async def send(self, request: httpx.Request, *args, **kwargs) -> httpx.Response:
        # Baseten exposes a single predict endpoint; ignore the path the
        # OpenAI client built and target that endpoint directly.
        request.url = httpx.URL(
            f"https://model-{os.getenv('MODEL_ID')}.api.baseten.co/development/predict"
        )
        return await super().send(request, *args, **kwargs)
85
+
86
+
87
def get_client(provider):
    """Return an AsyncOpenAI client configured for ``provider``.

    For non-Baseten providers, a placeholder OPENAI_API_KEY is installed if
    none is set, since the local servers do not validate it.
    """
    if provider != "b10":
        # Bug fix: this was `provider not in "b10"`, a substring test that
        # only worked by accident for the known provider names (e.g. a
        # provider "b1" would silently skip the key setup).
        if os.getenv("OPENAI_API_KEY") is None:
            os.environ["OPENAI_API_KEY"] = "EMPTY"
    return {
        "oai": AsyncOpenAI(base_url="http://127.0.0.1:8000/v1/"),
        "b10": AsyncOpenAI(
            api_key=f"Api-Key {os.getenv('OPENAI_API_KEY')}",
            base_url=f"https://model-{os.getenv('MODEL_ID')}.api.baseten.co/development/predict",
            http_client=CustomAsyncHTTPXClient(),
        ),
        "sgl": AsyncOpenAI(base_url="http://127.0.0.1:30000/v1/"),
    }[provider]
100
+
101
+
102
+ # Define the benchmark function
103
async def benchmark(args):
    """Fan out completion requests for the selected eval set.

    Responses are cached one pickle per prompt in ``args.output_dir``, so an
    interrupted run can be resumed (fetch_responses skips existing files).
    """
    ds = load_dataset(
        "meta-llama/Llama-3.1-405B-Instruct-evals",
        f"Llama-3.1-405B-Instruct-{TASK_TO_EVAL_SET[args.task]}",
    )
    semaphore = asyncio.Semaphore(args.concurrency)  # Cap in-flight requests

    if args.num_examples is None:
        args.num_examples = len(ds["latest"]["input_final_prompts"])
    prompts = ds["latest"]["input_final_prompts"][: args.num_examples]

    # Create the output directory if it does not exist
    os.makedirs(args.output_dir, exist_ok=True)

    tasks = []
    # Create the tasks with tqdm progress bar
    max_tokens = TASK_TO_MAX_TOKENS[TASK_TO_EVAL_SET[args.task]]
    client = get_client(args.provider)
    for idx, prompt in enumerate(tqdm(prompts, desc="Creating tasks")):
        tasks.append(
            asyncio.create_task(
                fetch_responses(
                    client,
                    f"<|begin_of_text|>{prompt[0]}",
                    semaphore,
                    idx,
                    args.provider,
                    args.model_size,
                    args.output_dir,
                    max_tokens=max_tokens,
                )
            )
        )

    # Run the tasks with tqdm progress bar
    for future in tqdm(
        asyncio.as_completed(tasks), total=len(tasks), desc="Processing tasks"
    ):
        await future
142
+
143
+
144
def get_mmlu_answer(response):
    """Extract an MMLU letter answer: trimmed, upper-cased, periods removed.

    Returns None when the response is missing.
    """
    if response is None:
        return None
    raw = response.choices[0].text
    return raw.strip().upper().replace(".", "")
148
+
149
+
150
def get_mmlu_cot_answer(response):
    """Pull the final answer out of a chain-of-thought MMLU completion.

    Tries several "best/correct answer is ..." phrasings in order. The first
    phrasing strips periods and asterisks from the match; the remaining ones
    strip only periods (preserving the original per-phrasing behavior).
    Returns None when nothing matches.
    """
    text = response.choices[0].text

    match = re.search(r"The best answer is (.+)\.?", text)
    if match:
        return match.group(1).replace(".", "").replace("*", "")

    for pattern in (
        r"the best answer is (.+)\.?",
        r"The correct answer is (.+)\.?",
        r"the correct answer is (.+)\.?",
    ):
        match = re.search(pattern, text)
        if match:
            return match.group(1).replace(".", "")
170
+
171
+
172
def get_answer_gsm8k(response):
    """Extract the numeric answer from a GSM8K completion.

    Strips "%" and "$" from the matched value; returns None when no
    "The final answer is ..." phrase is present.
    """
    match = re.search(r"The final answer is (.+)\.?", response.choices[0].text)
    if not match:
        return None
    answer = match.group(1)
    for symbol in ("%", "$"):
        answer = answer.replace(symbol, "")
    return answer
180
+
181
+
182
# Dispatch table: eval-set name -> function that parses the model's answer
# out of a raw completion response.
TASK_TO_ANSWER_EXTRACTOR = {
    "evals__mmlu__details": get_mmlu_answer,
    "evals__mmlu__0_shot__cot__details": get_mmlu_cot_answer,
    "evals__gsm8k__details": get_answer_gsm8k,
    "evals__mmlu_pro__details": get_mmlu_cot_answer,
}
188
+
189
+
190
def get_dataset_from_task(task, response_path, model_size):
    """Load the reference eval dataset for ``task`` and ``model_size``.

    The 405B eval set defines the canonical prompt order. For 8B/70B models,
    the rows of the model-specific eval set are re-ordered to align with the
    405B prompt hashes before returning.

    Note: ``response_path`` is unused here; kept for call-site symmetry.
    """
    ds_405b = load_dataset(
        f"meta-llama/Llama-3.1-405B-Instruct-evals",
        f"Llama-3.1-405B-Instruct-{task}",
    )
    # Canonical prompt ordering, keyed by prompt hash.
    ds_405b_hash_order = [x[0] for x in ds_405b["latest"]["input_final_prompts_hash"]]

    if "70b" in model_size or "8b" in model_size:
        if "70" in model_size:
            ref_model_ds = load_dataset(
                f"meta-llama/Llama-3.1-70B-Instruct-evals",
                f"Llama-3.1-70B-Instruct-{task}",
            )
        else:
            ref_model_ds = load_dataset(
                f"meta-llama/Llama-3.1-8B-Instruct-evals",
                f"Llama-3.1-8B-Instruct-{task}",
            )

        # Re-order the reference rows to match the 405B prompt order.
        hash_to_row = {}
        for row in ref_model_ds["latest"]:
            hash_to_row[row["input_final_prompts_hash"][0]] = row
        reordered_rows = []
        for prompt_hash in ds_405b_hash_order:
            reordered_rows.append(hash_to_row[prompt_hash])
        # NOTE(review): this replaces the "latest" split with a plain list of
        # dicts; analyze() only len()s and iterates it, so that is fine here.
        ref_model_ds["latest"] = reordered_rows
        return ref_model_ds

    return ds_405b
219
+
220
+
221
def analyze(task, response_path, model_size):
    """Score cached responses for ``task`` and print macro/micro accuracy.

    Compares this run's extracted answers against the dataset's accepted
    responses ("average") and also re-reports Meta's own recorded correctness
    ("meta_*") for reference.

    Args:
        task: Eval-set name (a key of TASK_TO_ANSWER_EXTRACTOR).
        response_path: Directory containing response_<i>.pkl files.
        model_size: Model size tag selecting the reference dataset.
    """
    ds = get_dataset_from_task(task, response_path, model_size)

    responses = []
    total = len(ds["latest"])

    for i in range(0, total):
        # Bug fix: use a context manager instead of leaking the file handle
        # via pickle.load(open(...)).
        with open(os.path.join(response_path, f"response_{i}.pkl"), "rb") as f:
            responses.append(pickle.load(f))

    @dataclass
    class Stats:
        correct: int = 0
        total: int = 0
        meta_correct: int = 0

        average: float = None

    subtask_name_to_stats = defaultdict(lambda: Stats())

    for response, ds_row in zip(responses, ds["latest"]):
        model_answer = TASK_TO_ANSWER_EXTRACTOR[task](response)

        subtask = ds_row["subtask_name"]

        is_eval_correct = model_answer in ds_row["input_correct_responses"]
        if is_eval_correct:
            subtask_name_to_stats[subtask].correct += 1

        # Meta's own correctness label for this row, tallied separately.
        if ds_row["is_correct"]:
            subtask_name_to_stats[subtask].meta_correct += 1

        subtask_name_to_stats[subtask].total += 1

    micro_stats = Stats()
    for subtask, stats in subtask_name_to_stats.items():
        stats.average = stats.correct / stats.total
        stats.meta_average = stats.meta_correct / stats.total

        micro_stats.correct += stats.correct
        micro_stats.total += stats.total
        micro_stats.meta_correct += stats.meta_correct

    micro_stats.average = micro_stats.correct / micro_stats.total
    micro_stats.meta_average = micro_stats.meta_correct / micro_stats.total

    # Macro = mean over subtasks; micro = pooled over all examples.
    print("Macro average", np.mean([x.average for x in subtask_name_to_stats.values()]))
    print(
        "Meta Macro average",
        np.mean([x.meta_average for x in subtask_name_to_stats.values()]),
    )
    print("Micro average", micro_stats.average)
    print("Meta Micro average", micro_stats.meta_average)
277
+
278
+ # Entry point for the script
279
+ if __name__ == "__main__":
280
+ parser = argparse.ArgumentParser(
281
+ description="Script to run model with specified parameters."
282
+ )
283
+ parser.add_argument(
284
+ "--model-size",
285
+ type=str,
286
+ default="8b",
287
+ help="Size of the model (e.g., 8b or 70b)",
288
+ )
289
+ parser.add_argument(
290
+ "--provider",
291
+ type=str,
292
+ default="sgl",
293
+ help="Provider name (e.g., sgl, oai, b10)",
294
+ )
295
+ parser.add_argument(
296
+ "--task",
297
+ type=str,
298
+ required=True,
299
+ help="Task (e.g., mmlu, mmlu_cot, mmlu_pro, gsm8k)",
300
+ )
301
+ parser.add_argument(
302
+ "--num-examples", type=int, default=None, help="Number of examples to process"
303
+ )
304
+ parser.add_argument("--concurrency", type=int, default=16)
305
+ parser.add_argument(
306
+ "--output-dir",
307
+ type=str,
308
+ default="tmp-output-dir",
309
+ help="Directory to save responses",
310
+ )
311
+
312
+ args = parser.parse_args()
313
+ asyncio.run(benchmark(args))
314
+ analyze(TASK_TO_EVAL_SET[args.task], args.output_dir, args.model_size)
315
+ shutil.rmtree("tmp-output-dir", ignore_errors=True)
sglang/eval/loogle_eval.py ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import asyncio
3
+ import os
4
+ import pickle
5
+ from pathlib import Path
6
+ from typing import List
7
+
8
+ import openai
9
+ import torch
10
+ from bert_score import BERTScorer
11
+ from datasets import load_dataset
12
+ from tqdm import tqdm
13
+
14
+
15
def get_client(api_url: str) -> openai.AsyncOpenAI:
    """Create an async OpenAI client for ``api_url``.

    Installs a placeholder OPENAI_API_KEY when none is set, since local
    OpenAI-compatible servers do not validate the key.
    """
    if os.getenv("OPENAI_API_KEY") is None:
        os.environ["OPENAI_API_KEY"] = "EMPTY"
    return openai.AsyncOpenAI(base_url=api_url)
19
+
20
+
21
def get_dataset():
    """Load the LooGLE long-dependency QA test split from HF Hub."""
    return load_dataset("bigai-nlco/LooGLE", "longdep_qa", split="test")
23
+
24
+
25
async def fetch_response(
    client: openai.AsyncOpenAI,
    context: str,
    question: str,
    semaphore: asyncio.Semaphore,
    index: int,
    model: str,
    output_dir: Path,
):
    """Ask one LooGLE question and cache the raw response to disk.

    Skips the request when ``response_<index>.pkl`` already exists, making
    reruns resumable. A BadRequestError (e.g. context too long) is recorded
    as ``{"error": ...}`` instead of raising, so analyse() can skip it.
    """
    output_file = output_dir / f"response_{index}.pkl"
    if output_file.exists():
        return

    prompt = (
        "Please answer the question based on the long texts below.\n"
        f"{context}\n"
        f"Question: {question}\n"
        "Answer:"
    )
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": prompt},
    ]

    async with semaphore:
        try:
            response = await client.chat.completions.create(
                model=model,
                messages=messages,
                temperature=0.0,
                max_tokens=512,
            )
        except openai.BadRequestError as e:
            # Record the failure so downstream scoring can skip this example.
            with open(output_file, "wb") as f:
                pickle.dump({"error": str(e)}, f)
            return

        with open(output_file, "wb") as f:
            pickle.dump(response, f)
+ pickle.dump(response, f)
64
+
65
+
66
async def benchmark(args):
    """Send up to ``args.num_prompts`` LooGLE questions concurrently,
    caching each response under ``args.output_dir``."""
    dataset = get_dataset()
    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    client = get_client(args.api_url)
    # Bounds the number of in-flight requests.
    semaphore = asyncio.Semaphore(args.max_concurrency)

    tasks: List[asyncio.Task] = []
    for idx, ex in enumerate(dataset):
        if idx >= args.num_prompts:
            break
        tasks.append(
            asyncio.create_task(
                fetch_response(
                    client,
                    ex["context"],
                    ex["question"],
                    semaphore,
                    idx,
                    args.model,
                    output_dir,
                )
            )
        )

    # Await tasks as they finish so the progress bar advances in real time.
    for _ in tqdm(
        asyncio.as_completed(tasks), total=len(tasks), desc="Running benchmark"
    ):
        await _
96
+
97
+
98
def analyse(args):
    """Score cached responses against reference answers with BERTScore F1.

    Loads each ``response_<idx>.pkl`` produced by ``benchmark`` (skipping
    entries recorded as errors), computes BERTScore in batches, and prints the
    average F1.

    Raises:
        FileNotFoundError: if any expected response file is missing.
    """
    dataset = get_dataset()
    output_dir = Path(args.output_dir)

    device = "cuda" if torch.cuda.is_available() else "cpu"
    scorer = BERTScorer(lang="en", device=device)

    hyps: List[str] = []
    refs: List[str] = []
    for idx, ex in enumerate(tqdm(dataset, desc="Loading responses")):
        if idx >= args.num_prompts:
            break
        pkl_file = output_dir / f"response_{idx}.pkl"
        if not pkl_file.exists():
            raise FileNotFoundError(pkl_file)

        # Bug fix: use a context manager instead of leaking the file handle
        # via pickle.load(open(...)).
        with open(pkl_file, "rb") as f:
            response = pickle.load(f)
        if isinstance(response, dict) and "error" in response:
            continue

        hyps.append(response.choices[0].message.content.strip())
        refs.append(ex["answer"])

    if not hyps:
        print("No valid responses to score!")
        return

    batch_size = 64
    all_f1: List[float] = []
    for i in tqdm(range(0, len(hyps), batch_size), desc="Scoring batches"):
        h_batch = hyps[i : i + batch_size]
        r_batch = refs[i : i + batch_size]
        _, _, f1_scores = scorer.score(h_batch, r_batch, verbose=False)
        all_f1.extend([float(x) for x in f1_scores])

    avg = sum(all_f1) / len(all_f1)
    print(f"Average BERTScore (F1): {avg:.2%}")
135
+
136
+
137
+ if __name__ == "__main__":
138
+ parser = argparse.ArgumentParser(
139
+ description="Run benchmark and evaluation in one go."
140
+ )
141
+ parser.add_argument(
142
+ "--api-url",
143
+ default="http://127.0.0.1:30000/v1",
144
+ help="OpenAI‑compatible API base URL",
145
+ )
146
+ parser.add_argument(
147
+ "--model",
148
+ default="meta-llama/Llama-4-Maverick-17B-128E-Instruct",
149
+ help="Model name or ID, only used for model name",
150
+ )
151
+ parser.add_argument(
152
+ "--max-concurrency", type=int, default=144, help="Maximum concurrent requests"
153
+ )
154
+ parser.add_argument(
155
+ "--output-dir", default="tmp-output-dir", help="Directory for cached responses"
156
+ )
157
+ parser.add_argument(
158
+ "--num-prompts", type=int, default=10000, help="Number of prompts to run"
159
+ )
160
+ args = parser.parse_args()
161
+
162
+ asyncio.run(benchmark(args))
163
+
164
+ analyse(args)
sglang/global_config.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Global configurations"""
2
+
3
+ # FIXME: deprecate this file and move all usage to sglang.srt.environ or sglang.__init__.py
4
+
5
+
6
class GlobalConfig:
    """
    Store some global constants.
    """

    def __init__(self):
        # Verbosity: 0 = silent, 2 = print the final text after every run.
        self.verbosity = 0
        # Default language-frontend backend; populated elsewhere at runtime.
        self.default_backend = None
        # Detokenization behavior for generated output.
        self.skip_special_tokens_in_output = True
        self.spaces_between_special_tokens_in_out = True
        # Frontend interpreter optimizations.
        self.enable_precache_with_tracing = True
        self.enable_parallel_encoding = True


# Process-wide singleton instance.
global_config = GlobalConfig()
sglang/jit_kernel/.clang-format ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ BasedOnStyle: Google
2
+ IndentWidth: 2
3
+ ColumnLimit: 120
4
+ AllowShortFunctionsOnASingleLine: Empty
5
+ DerivePointerAlignment: false
6
+ PointerAlignment: Left
7
+ NamespaceIndentation: None
8
+ SortIncludes: true
9
+ AllowShortLoopsOnASingleLine: false
10
+ BinPackParameters: false # Prevents packing parameters in declarations
11
+ BinPackArguments: false # Prevents packing arguments in function calls
12
+ AlignAfterOpenBracket: AlwaysBreak # Forces a break after the opening parenthesis
13
+ AlignOperands: Align # Aligns operands of binary/ternary expressions vertically
14
+ PenaltyBreakBeforeFirstCallParameter: 1 # Encourages breaking before the first argument
15
+ PenaltyReturnTypeOnItsOwnLine: 100 # Keeps return type with function name
16
+
17
+ IncludeCategories:
18
+ - Regex: '^<sgl_kernel/.*>$'
19
+ Priority: 0
sglang/jit_kernel/__pycache__/hicache.cpython-311.pyc ADDED
Binary file (4.35 kB). View file
 
sglang/jit_kernel/__pycache__/utils.cpython-311.pyc ADDED
Binary file (6.87 kB). View file
 
sglang/jit_kernel/csrc/cuda_wait_value.cuh ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include <sgl_kernel/tensor.h>
2
+ #include <sgl_kernel/utils.cuh>
3
+
4
+ #include <cuda_runtime_api.h>
5
+
6
+ #include <cstdint>
7
+
8
namespace {

// Spin on a device-visible flag word until it equals `target`.
// The volatile alias forces a fresh global-memory load on every poll.
__global__ void wait_flag_kernel(const int32_t* flag, int32_t target) {
  const volatile int32_t* vflag = (volatile const int32_t*)flag;

  while (*vflag != target) {
#if __CUDA_ARCH__ >= 700
    // Back off briefly between polls to reduce memory-system pressure.
    __nanosleep(100);
#else
    // Note: This falls back to an inefficient busy-wait on pre-Volta architectures.
#endif
  }
}

// Enqueue a single-thread kernel that blocks the stream until flag[0] == value.
// `flag` must be a non-empty 1-D int32 CUDA tensor; only element 0 is polled.
auto stream_wait_value(const tvm::ffi::TensorView flag, std::int32_t value) -> void {
  using namespace host;

  // Validate shape (1-D), dtype (int32) and device (CUDA); bind the length.
  auto length = SymbolicSize{"length"};
  TensorMatcher({length}).with_dtype<int32_t>().with_device<kDLCUDA>().verify(flag);
  RuntimeCheck(length.unwrap() >= 1, "wait_flag expects a non-empty tensor.");

  auto* ptr = static_cast<std::int32_t*>(flag.data_ptr());
  // NOTE(review): presumably resolves the active stream for flag's device — confirm.
  const auto stream = LaunchKernel::resolve_device(flag.device());

  // A single thread suffices: the kernel only polls one 32-bit word.
  constexpr int blocks = 1;
  constexpr int threads = 1;
  wait_flag_kernel<<<blocks, threads, 0, stream>>>(ptr, value);
  RuntimeDeviceCheck(cudaGetLastError());
}

} // namespace
sglang/jit_kernel/csrc/hicache.cuh ADDED
@@ -0,0 +1,264 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include <sgl_kernel/tensor.h>
2
+ #include <sgl_kernel/utils.cuh>
3
+ #include <sgl_kernel/utils.h>
4
+ #include <sgl_kernel/warp.cuh>
5
+
6
+ #include <dlpack/dlpack.h>
7
+
8
+ #include <algorithm>
9
+ #include <concepts>
10
+ #include <cstddef>
11
+ #include <cstdint>
12
+ #include <type_traits>
13
+
14
+ namespace {
15
+
16
// Launch-argument bundle shared by both transfer kernels (passed by value as
// a __grid_constant__). For the per-layer kernel the k/v pointers address the
// caches directly; for the all-layer kernel they address device arrays of
// per-layer cache pointers. Strides are in bytes.
struct HicacheKernelParams {
  void* __restrict__ k_cache_dst;
  void* __restrict__ v_cache_dst;
  const void* __restrict__ indices_dst;
  void* __restrict__ k_cache_src;
  void* __restrict__ v_cache_src;
  const void* __restrict__ indices_src;
  std::size_t length;               // number of rows (indices) to move
  std::size_t kv_cache_src_stride;  // bytes between consecutive source rows
  std::size_t kv_cache_dst_stride;  // bytes between consecutive destination rows
  std::size_t num_layers = 0;  // only used in all_layer transfer
};
28
+
29
// Gather/scatter one layer of K/V cache rows: for each i < length, copy row
// indices_src[i] of the source caches to row indices_dst[i] of the destination
// caches. Each logical warp (hardware warp shrunk by kUnroll) is one worker
// that moves a whole row cooperatively.
template <
    std::integral T,           // index element type (int32_t or int64_t)
    std::size_t kElementSize,  // bytes per cache row
    std::size_t kUnroll,       // divides the lanes that form one worker
    std::size_t kBlockQuota,   // upper bound on blocks launched
    std::size_t kNumThreads,   // threads per block
    std::size_t kMaxOccupancy>
__global__ __launch_bounds__(kNumThreads, kMaxOccupancy) void hicache_transfer_per_layer(
    const __grid_constant__ HicacheKernelParams params) {
  // each warp acts as a worker
  using namespace device;
  // NOTE(review): at this point the unqualified kWarpThreads still names
  // device::kWarpThreads (the shadowing alias below is not yet declared) —
  // confirm that is the intended operand of these asserts.
  static_assert(kNumThreads % kWarpThreads == 0);
  static_assert(kWarpThreads % kUnroll == 0);

  // Local alias shadows device::kWarpThreads: a "warp" here is the hardware
  // warp divided by the unroll factor, so each lane carries more bytes.
  constexpr auto kWarpThreads = device::kWarpThreads / kUnroll;
  constexpr auto kWarpsPerBlock = kNumThreads / kWarpThreads;
  constexpr auto kWorkers = kWarpsPerBlock * kBlockQuota;

  const auto& [
    k_cache_dst, v_cache_dst, indices_dst, // dst
    k_cache_src, v_cache_src, indices_src, // src
    length, kv_cache_src_stride, kv_cache_dst_stride, _ // metadata
  ] = params;
  const auto warp_id = blockIdx.x * kWarpsPerBlock + threadIdx.x / kWarpThreads;

  // force to transfer 128 bytes per iteration
  // since the PCIe transaction size is 128 bytes aligned
  constexpr auto kGranularity = 128 / kWarpThreads;

  // Grid-sized stride over the index list; each worker handles rows
  // warp_id, warp_id + kWorkers, warp_id + 2*kWorkers, ...
  for (auto i = warp_id; i < length; i += kWorkers) {
    const auto pos_src = static_cast<const T*>(indices_src)[i];
    const auto pos_dst = static_cast<const T*>(indices_dst)[i];
    const auto src_k = pointer::offset(k_cache_src, pos_src * kv_cache_src_stride);
    const auto dst_k = pointer::offset(k_cache_dst, pos_dst * kv_cache_dst_stride);
    const auto src_v = pointer::offset(v_cache_src, pos_src * kv_cache_src_stride);
    const auto dst_v = pointer::offset(v_cache_dst, pos_dst * kv_cache_dst_stride);
    // Load both rows fully before storing; K and V use identical layouts.
    const auto vec_k = warp::load_vec<kElementSize, kGranularity, kWarpThreads>(src_k);
    const auto vec_v = warp::load_vec<kElementSize, kGranularity, kWarpThreads>(src_v);
    warp::store_vec<kElementSize, kGranularity, kWarpThreads>(dst_k, vec_k);
    warp::store_vec<kElementSize, kGranularity, kWarpThreads>(dst_v, vec_v);
  }
}
71
+
72
// Same row gather/scatter as hicache_transfer_per_layer, but repeated across
// num_layers layers: the k/v params are device arrays of per-layer cache base
// pointers rather than the caches themselves.
template <
    std::integral T,           // index element type (int32_t or int64_t)
    std::size_t kElementSize,  // bytes per cache row
    std::size_t kUnroll,       // divides the lanes that form one worker
    std::size_t kBlockQuota,   // upper bound on blocks launched
    std::size_t kNumThreads,   // threads per block
    std::size_t kMaxOccupancy>
__global__ __launch_bounds__(kNumThreads, kMaxOccupancy) void hicache_transfer_all_layer(
    const __grid_constant__ HicacheKernelParams params) {
  // each warp acts as a worker
  using namespace device;
  // Element types of the per-layer pointer tables.
  using src_ptr_t = std::add_pointer_t<const void* const>;
  using dst_ptr_t = std::add_pointer_t<void* const>;

  // NOTE(review): this assert uses device::kWarpThreads (declared before the
  // shadowing alias below); unlike the per-layer kernel there is no
  // kWarpThreads % kUnroll check here — confirm that omission is intended.
  static_assert(kNumThreads % kWarpThreads == 0);
  constexpr auto kWarpThreads = device::kWarpThreads / kUnroll;
  constexpr auto kWarpsPerBlock = static_cast<uint32_t>(kNumThreads) / kWarpThreads;
  constexpr auto kWorkers = kWarpsPerBlock * kBlockQuota;

  const auto& [
    k_ptr_dst, v_ptr_dst, indices_dst, // dst
    k_ptr_src, v_ptr_src, indices_src, // src
    length, kv_cache_src_stride, kv_cache_dst_stride, num_layers // metadata
  ] = params;
  const auto warp_id = blockIdx.x * kWarpsPerBlock + threadIdx.x / kWarpThreads;

  // force to transfer 128 bytes per iteration
  // since the PCIe transaction size is 128 bytes aligned
  constexpr auto kGranularity = 128 / kWarpThreads;

  // Outer loop strides over indices; inner loop walks the layer tables and
  // copies the same (pos_src -> pos_dst) row in every layer.
  for (auto i = warp_id; i < length; i += kWorkers) {
    const auto pos_src = static_cast<const T*>(indices_src)[i];
    const auto pos_dst = static_cast<const T*>(indices_dst)[i];
    for (std::size_t layer = 0; layer < num_layers; ++layer) {
      const auto k_cache_src = static_cast<src_ptr_t>(k_ptr_src)[layer];
      const auto v_cache_src = static_cast<src_ptr_t>(v_ptr_src)[layer];
      const auto k_cache_dst = static_cast<dst_ptr_t>(k_ptr_dst)[layer];
      const auto v_cache_dst = static_cast<dst_ptr_t>(v_ptr_dst)[layer];
      const auto src_k = pointer::offset(k_cache_src, pos_src * kv_cache_src_stride);
      const auto dst_k = pointer::offset(k_cache_dst, pos_dst * kv_cache_dst_stride);
      const auto src_v = pointer::offset(v_cache_src, pos_src * kv_cache_src_stride);
      const auto dst_v = pointer::offset(v_cache_dst, pos_dst * kv_cache_dst_stride);
      const auto vec_k = warp::load_vec<kElementSize, kGranularity, kWarpThreads>(src_k);
      const auto vec_v = warp::load_vec<kElementSize, kGranularity, kWarpThreads>(src_v);
      warp::store_vec<kElementSize, kGranularity, kWarpThreads>(dst_k, vec_k);
      warp::store_vec<kElementSize, kGranularity, kWarpThreads>(dst_v, vec_v);
    }
  }
}
121
+
122
// Host-side launcher. The five non-type template parameters select a compiled
// kernel instantiation (they mirror the Python JIT cache key); the index
// element type (int32/int64) is chosen at run time from the indices dtype.
template <
    std::size_t kElementSize,
    std::size_t kUnroll,
    std::size_t kBlockQuota,
    std::size_t kNumThreads,
    std::size_t kMaxOccupancy>
struct HiCacheKernel {
  // Kernel handles, parameterized only by the index element type.
  template <typename T>
  static constexpr auto _kernel_one =
      hicache_transfer_per_layer<T, kElementSize, kUnroll, kBlockQuota, kNumThreads, kMaxOccupancy>;
  template <typename T>
  static constexpr auto _kernel_all =
      hicache_transfer_all_layer<T, kElementSize, kUnroll, kBlockQuota, kNumThreads, kMaxOccupancy>;

  // Validate tensors and launch the single-layer transfer.
  // Caches are 2-D (rows, D) with unit inner stride; K/V of each side must
  // share dtype and row stride. Indices are 1-D CUDA tensors of equal length
  // and a common dtype (int32 or int64).
  static void run_one(
      const tvm::ffi::TensorView k_cache_dst,
      const tvm::ffi::TensorView v_cache_dst,
      const tvm::ffi::TensorView indices_dst,
      const tvm::ffi::TensorView k_cache_src,
      const tvm::ffi::TensorView v_cache_src,
      const tvm::ffi::TensorView indices_src) {
    using namespace host;

    auto D = SymbolicSize{"D"}; // cache dimension
    auto N = SymbolicSize{"N"}; // src kv stride
    auto M = SymbolicSize{"M"}; // dst kv stride
    auto L = SymbolicSize{"L"}; // indices length
    auto cache_dtype = SymbolicDType{};
    auto indices_dtype = SymbolicDType{};
    auto indices_device = SymbolicDevice{};

    // -1 leaves the row count unconstrained; src and dst may differ in rows
    // but must agree on D and (pairwise) on their strides and dtype.
    TensorMatcher({-1, D}) //
        .with_strides({N, 1})
        .with_dtype(cache_dtype)
        .with_device<kDLCUDA, kDLCUDAHost, kDLCPU>()
        .verify(k_cache_src)
        .verify(v_cache_src);
    TensorMatcher({-1, D}) //
        .with_strides({M, 1})
        .with_dtype(cache_dtype)
        .with_device<kDLCUDA, kDLCUDAHost, kDLCPU>()
        .verify(k_cache_dst)
        .verify(v_cache_dst);
    TensorMatcher({L}) //
        .with_dtype<int32_t, int64_t>(indices_dtype)
        .with_device<kDLCUDA>(indices_device)
        .verify(indices_src)
        .verify(indices_dst);

    // verify dimension match
    // The kernel's compile-time row size must equal D * sizeof(dtype).
    const auto dtype_size = dtype_bytes(cache_dtype.unwrap());
    const auto element_bytes = D.unwrap() * dtype_size;
    RuntimeCheck(kElementSize == element_bytes, "HicacheKernel: cache dimension mismatch.");

    const auto k_cache_dst_ptr = k_cache_dst.data_ptr();
    const auto v_cache_dst_ptr = v_cache_dst.data_ptr();
    const auto k_cache_src_ptr = k_cache_src.data_ptr();
    const auto v_cache_src_ptr = v_cache_src.data_ptr();
    const auto indices_dst_ptr = indices_dst.data_ptr();
    const auto indices_src_ptr = indices_src.data_ptr();
    const auto length = static_cast<std::size_t>(L.unwrap());
    // Convert element strides to byte strides for the kernel.
    const auto kv_cache_src_stride = static_cast<std::size_t>(N.unwrap()) * dtype_size;
    const auto kv_cache_dst_stride = static_cast<std::size_t>(M.unwrap()) * dtype_size;
    const auto use_int32 = indices_dtype.unwrap().bits == 32;
    const auto device = indices_device.unwrap();

    // Enough blocks to give every index a worker, capped by kBlockQuota.
    constexpr auto kWorkersPerBlock = kNumThreads / (device::kWarpThreads / kUnroll);
    const auto num_blocks = std::min(div_ceil(length, kWorkersPerBlock), kBlockQuota);
    const auto params = HicacheKernelParams{
        .k_cache_dst = k_cache_dst_ptr,
        .v_cache_dst = v_cache_dst_ptr,
        .indices_dst = indices_dst_ptr,
        .k_cache_src = k_cache_src_ptr,
        .v_cache_src = v_cache_src_ptr,
        .indices_src = indices_src_ptr,
        .length = length,
        .kv_cache_src_stride = kv_cache_src_stride,
        .kv_cache_dst_stride = kv_cache_dst_stride,
    };
    const auto kernel = use_int32 ? _kernel_one<int32_t> : _kernel_one<int64_t>;
    LaunchKernel(num_blocks, kNumThreads, device)(kernel, params);
  }

  // Validate tensors and launch the all-layer transfer.
  // k/v arguments are 1-D uint64 CUDA tensors holding per-layer cache base
  // addresses; strides are given directly in bytes by the caller. All pointer
  // tables and indices must live on the same CUDA device (shared device_).
  static void run_all(
      const tvm::ffi::TensorView k_ptr_dst,
      const tvm::ffi::TensorView v_ptr_dst,
      const tvm::ffi::TensorView indices_dst,
      const tvm::ffi::TensorView k_ptr_src,
      const tvm::ffi::TensorView v_ptr_src,
      const tvm::ffi::TensorView indices_src,
      const std::size_t kv_src_stride,
      const std::size_t kv_dst_stride) {
    using namespace host;

    auto N = SymbolicSize{"N"}; // num layers
    auto L = SymbolicSize{"L"}; // indices length
    auto dtype_ = SymbolicDType{};
    auto device_ = SymbolicDevice{};

    TensorMatcher({N}) //
        .with_dtype<uint64_t>()
        .with_device<kDLCUDA>(device_)
        .verify(k_ptr_src)
        .verify(v_ptr_src)
        .verify(k_ptr_dst)
        .verify(v_ptr_dst);
    TensorMatcher({L}) //
        .with_dtype<int32_t, int64_t>(dtype_)
        .with_device<kDLCUDA>(device_)
        .verify(indices_src)
        .verify(indices_dst);

    // Unpack raw pointers and runtime dimensions for the launch.
    const auto k_cache_dst_ptr = k_ptr_dst.data_ptr();
    const auto v_cache_dst_ptr = v_ptr_dst.data_ptr();
    const auto k_cache_src_ptr = k_ptr_src.data_ptr();
    const auto v_cache_src_ptr = v_ptr_src.data_ptr();
    const auto indices_dst_ptr = indices_dst.data_ptr();
    const auto indices_src_ptr = indices_src.data_ptr();
    const auto length = static_cast<std::size_t>(L.unwrap());
    const auto use_int32 = dtype_.unwrap().bits == 32;
    const auto device = device_.unwrap();

    // Same block-count policy as run_one.
    constexpr auto kWorkersPerBlock = kNumThreads / (device::kWarpThreads / kUnroll);
    const auto num_blocks = std::min(div_ceil(length, kWorkersPerBlock), kBlockQuota);
    const auto params = HicacheKernelParams{
        .k_cache_dst = k_cache_dst_ptr,
        .v_cache_dst = v_cache_dst_ptr,
        .indices_dst = indices_dst_ptr,
        .k_cache_src = k_cache_src_ptr,
        .v_cache_src = v_cache_src_ptr,
        .indices_src = indices_src_ptr,
        .length = length,
        .kv_cache_src_stride = kv_src_stride,
        .kv_cache_dst_stride = kv_dst_stride,
        .num_layers = static_cast<std::size_t>(N.unwrap()),
    };
    const auto kernel = use_int32 ? _kernel_all<int32_t> : _kernel_all<int64_t>;
    LaunchKernel(num_blocks, kNumThreads, device)(kernel, params);
  }
};
263
+
264
+ } // namespace
sglang/jit_kernel/cuda_wait_value.py ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from functools import lru_cache
4
+ from typing import TYPE_CHECKING
5
+
6
+ import torch
7
+
8
+ from sglang.jit_kernel.utils import load_jit
9
+
10
+ if TYPE_CHECKING:
11
+ import torch
12
+ from tvm_ffi.module import Module
13
+
14
+
15
@lru_cache(maxsize=1)
def _jit_stream_wait_value_module() -> Module:
    """Compile (at most once) the JIT module exposing ``stream_wait_value``."""
    sources = ["cuda_wait_value.cuh"]
    wrappers = [("stream_wait_value", "stream_wait_value")]
    return load_jit("cuda_wait_value", cuda_files=sources, cuda_wrappers=wrappers)
22
+
23
+
24
def stream_wait_value(flag: torch.Tensor, value: int) -> None:
    """Block the current CUDA stream until ``flag`` holds ``value``."""
    _jit_stream_wait_value_module().stream_wait_value(flag, value)
27
+
28
+
29
class Event:
    """Minimal CUDA event substitute backed by a one-element int32 flag tensor."""

    def __init__(self) -> None:
        # Flag starts at 0, i.e. "not recorded".
        self.flag = torch.zeros(1, dtype=torch.int32, device="cuda")

    def record(self, value: int = 1) -> None:
        """Write ``value`` into the flag on the current stream."""
        self.flag.fill_(value)

    def wait(self, value: int = 1) -> None:
        """Enqueue a spin-wait on the current stream until the flag equals ``value``."""
        stream_wait_value(self.flag, value)
38
+
39
+
40
def test_wait_before_record(event: Event | torch.cuda.Event):
    """Wait on ``event`` from one stream before it is ever recorded, then record.

    Used by ``main`` to contrast the two event types: the custom Event's wait
    spins on the device, while torch.cuda.Event treats an un-recorded wait as
    a no-op.
    """
    waiter = torch.cuda.Stream()
    recorder = torch.cuda.Stream()

    with torch.cuda.stream(waiter):
        event.wait()

    waiter.synchronize()

    with torch.cuda.stream(recorder):
        event.record()
51
+
52
+
53
def main():
    """Smoke-test that the custom Event blocks an un-recorded wait.

    Runs ``test_wait_before_record`` in two threads: one with the custom
    ``Event`` (expected to stay blocked in ``synchronize``) and one with
    ``torch.cuda.Event`` (expected to finish). Asserts both outcomes after
    polling for five seconds.
    """
    import threading
    import time

    # Fix: identifier was previously misspelled as ``block_thead``; the typo
    # also leaked into the printed f"{...=}" debug line below.
    # Daemon thread: it is expected to block forever and must not prevent exit.
    block_thread = threading.Thread(
        target=test_wait_before_record, args=(Event(),), daemon=True
    )
    block_thread.start()

    non_block_thread = threading.Thread(
        target=test_wait_before_record, args=(torch.cuda.Event(),)
    )
    non_block_thread.start()

    print("Checking if custom Event blocks the stream...", flush=True)
    for _ in range(5):
        print(
            f"{block_thread.is_alive()=}, {non_block_thread.is_alive()=}", flush=True
        )
        time.sleep(1)

    assert block_thread.is_alive(), "Custom Event did not block as expected"
    assert not non_block_thread.is_alive(), "torch.cuda.Event should not block"
    print("=" * 40)
    print("Test completed successfully.")
76
+
77
+
78
+ if __name__ == "__main__":
79
+ main()
sglang/jit_kernel/hicache.py ADDED
@@ -0,0 +1,138 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import logging
4
+ from functools import lru_cache
5
+ from typing import TYPE_CHECKING
6
+
7
+ from sglang.jit_kernel.utils import load_jit, make_cpp_args
8
+
9
+ if TYPE_CHECKING:
10
+ import torch
11
+ from tvm_ffi.module import Module
12
+
13
+ DEFAULT_BLOCK_QUOTA = 2
14
+
15
+
16
@lru_cache(maxsize=None)
def _jit_hicache_module(*, element_size: int, unroll: int, block_quota: int) -> Module:
    """Compile (and memoize) the HiCache JIT module for one template config.

    The keyword arguments become C++ non-type template parameters of
    ``HiCacheKernel``; each distinct combination yields a separately compiled
    and cached module.
    """
    # Fixed launch shape: 1024 threads per block, min-occupancy hint of 1.
    num_threads, occupancy = 1024, 1
    # NOTE(review): `args` is interpolated both into load_jit(*args) and into
    # the template-argument list f"HiCacheKernel<{args}>" — presumably
    # make_cpp_args returns an object whose str() is a comma-separated C++
    # argument list; confirm against sglang.jit_kernel.utils.make_cpp_args.
    args = make_cpp_args(
        element_size,
        unroll,
        block_quota,
        num_threads,
        occupancy,
    )
    return load_jit(
        "hicache",
        *args,
        cuda_files=["hicache.cuh"],
        cuda_wrappers=[
            ("launch_one", f"HiCacheKernel<{args}>::run_one"),
            ("launch_all", f"HiCacheKernel<{args}>::run_all"),
        ],
    )
35
+
36
+
37
def can_use_hicache_jit_kernel(
    *,
    element_size: int,
    unroll: int | None = None,  # can be tuned for performance
    block_quota: int | None = None,  # can be tuned for less interference
) -> bool:
    """Probe whether the HiCache JIT kernel compiles for this configuration.

    Returns True on success; logs a warning and returns False on any failure
    (best-effort capability check, never raises).
    """
    try:
        _jit_hicache_module(
            element_size=element_size,
            unroll=unroll or _default_unroll(element_size),
            block_quota=block_quota or DEFAULT_BLOCK_QUOTA,
        )
    except Exception as e:
        logging.getLogger(__name__).warning(
            f"Failed to load JIT HiCache kernel: {e}"
        )
        return False
    return True
56
+
57
+
58
+ def _default_unroll(element_size: int) -> int:
59
+ if element_size <= 512:
60
+ return 4
61
+
62
+ if element_size <= 1024:
63
+ return 2
64
+
65
+ # fallback: no unroll
66
+ return 1
67
+
68
+
69
def transfer_hicache_one_layer(
    k_cache_dst: torch.Tensor,
    v_cache_dst: torch.Tensor,
    indices_dst: torch.Tensor,
    k_cache_src: torch.Tensor,
    v_cache_src: torch.Tensor,
    indices_src: torch.Tensor,
    *,
    element_dim: int | None = None,
    unroll: int | None = None,  # can be tuned for performance
    block_quota: int | None = None,  # can be tuned for less interference
) -> None:
    """Copy K/V cache rows selected by ``indices_src`` into the rows selected
    by ``indices_dst`` for a single layer, via the JIT HiCache kernel.
    """
    element_dim = element_dim or k_cache_dst.size(-1)
    # Flatten every cache to (rows, element_dim) before handing it to the kernel.
    k_src, v_src, k_dst, v_dst = (
        t.view(-1, element_dim)
        for t in (k_cache_src, v_cache_src, k_cache_dst, v_cache_dst)
    )
    element_size = element_dim * k_dst.element_size()
    module = _jit_hicache_module(
        element_size=element_size,
        unroll=unroll or _default_unroll(element_size),
        block_quota=block_quota or DEFAULT_BLOCK_QUOTA,
    )
    module.launch_one(k_dst, v_dst, indices_dst, k_src, v_src, indices_src)
102
+
103
+
104
def transfer_hicache_all_layer(
    k_ptr_dst: torch.Tensor,
    v_ptr_dst: torch.Tensor,
    indices_dst: torch.Tensor,
    k_ptr_src: torch.Tensor,
    v_ptr_src: torch.Tensor,
    indices_src: torch.Tensor,
    *,
    kv_cache_src_stride_bytes: int,
    kv_cache_dst_stride_bytes: int,
    element_size: int | None = None,
    unroll: int | None = None,  # can be tuned for performance
    block_quota: int | None = None,  # can be tuned for less interference
) -> None:
    """Copy the indexed K/V rows across *all* layers at once.

    The ``*_ptr_*`` tensors hold per-layer cache base pointers; strides are
    given in bytes. When ``element_size`` is omitted, both caches are assumed
    contiguous and the (equal) stride is used as the row size.
    """
    if element_size is None:  # assume both contiguous
        assert kv_cache_dst_stride_bytes == kv_cache_src_stride_bytes
        element_size = kv_cache_dst_stride_bytes

    module = _jit_hicache_module(
        element_size=element_size,
        unroll=unroll or _default_unroll(element_size),
        block_quota=block_quota or DEFAULT_BLOCK_QUOTA,
    )
    module.launch_all(
        k_ptr_dst,
        v_ptr_dst,
        indices_dst,
        k_ptr_src,
        v_ptr_src,
        indices_src,
        kv_cache_src_stride_bytes,
        kv_cache_dst_stride_bytes,
    )
sglang/jit_kernel/include/sgl_kernel/tensor.h ADDED
@@ -0,0 +1,487 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+ #include <sgl_kernel/utils.h>
3
+
4
+ #include <dlpack/dlpack.h>
5
+ #include <tvm/ffi/container/tensor.h>
6
+ #include <tvm/ffi/dtype.h>
7
+
8
+ #include <algorithm>
9
+ #include <array>
10
+ #include <concepts>
11
+ #include <cstddef>
12
+ #include <cstdint>
13
+ #include <initializer_list>
14
+ #include <optional>
15
+ #include <ranges>
16
+ #include <source_location>
17
+ #include <span>
18
+ #include <sstream>
19
+ #include <string>
20
+ #include <string_view>
21
+ #include <type_traits>
22
+ #include <utility>
23
+
24
+ namespace host {
25
+
26
+ namespace stdr = std::ranges;
27
+ namespace stdv = std::views;
28
+
29
namespace details {

// Forward declarations for the *Ref wrappers defined later in this header.
struct SizeRef;
struct DTypeRef;
struct DeviceRef;

// Maps a C++ scalar type to its DLPack DLDataType descriptor.
template <typename T>
struct dtype_trait {};

template <std::integral T>
struct dtype_trait<T> {
  inline static constexpr auto value = DLDataType{
      .code = std::is_signed_v<T> ? DLDataTypeCode::kDLInt : DLDataTypeCode::kDLUInt,
      .bits = static_cast<std::uint8_t>(sizeof(T) * 8),
      .lanes = 1};
};

template <std::floating_point T>
struct dtype_trait<T> {
  inline static constexpr auto value =
      DLDataType{.code = DLDataTypeCode::kDLFloat, .bits = static_cast<std::uint8_t>(sizeof(T) * 8), .lanes = 1};
};

// Sentinels for "unset"/"wildcard" states of the Symbolic* classes.
// NOTE(review): kNullDType is 18u, presumably a value outside the defined
// DLDataTypeCode range — confirm against the dlpack version in use.
inline constexpr auto kAnyDeviceID = -1;
inline constexpr auto kAnySize = static_cast<int64_t>(-1);
inline constexpr auto kNullSize = static_cast<int64_t>(-1);
inline constexpr auto kNullDType = static_cast<DLDataTypeCode>(18u);
inline constexpr auto kNullDevice = static_cast<DLDeviceType>(-1);

// Static option arrays built from type/device-code template packs.
template <typename... Ts>
inline constexpr auto kDTypeList = std::array<DLDataType, sizeof...(Ts)>{dtype_trait<Ts>::value...};

template <DLDeviceType... Codes>
inline constexpr auto kDeviceList = std::array<DLDevice, sizeof...(Codes)>{
    DLDevice{.device_type = static_cast<DLDeviceType>(Codes), .device_id = kAnyDeviceID}...};

// Adapter that gives a span a streaming operator (used in error messages).
template <typename T>
struct PrintAbleSpan {
  explicit PrintAbleSpan(std::span<const T> data) : data(data) {}
  std::span<const T> data;
};

// DLDeviceType -> human-readable name, as a dense table indexed by the enum
// value (empty string_view marks an unknown/unmapped code).
inline constexpr auto kDeviceStringMap = [] {
  constexpr auto map = std::array<std::pair<DLDeviceType, const char*>, 16>{
      std::pair{DLDeviceType::kDLCPU, "cpu"},
      std::pair{DLDeviceType::kDLCUDA, "cuda"},
      std::pair{DLDeviceType::kDLCUDAHost, "cuda_host"},
      std::pair{DLDeviceType::kDLOpenCL, "opencl"},
      std::pair{DLDeviceType::kDLVulkan, "vulkan"},
      std::pair{DLDeviceType::kDLMetal, "metal"},
      std::pair{DLDeviceType::kDLVPI, "vpi"},
      std::pair{DLDeviceType::kDLROCM, "rocm"},
      std::pair{DLDeviceType::kDLROCMHost, "rocm_host"},
      std::pair{DLDeviceType::kDLExtDev, "ext_dev"},
      std::pair{DLDeviceType::kDLCUDAManaged, "cuda_managed"},
      std::pair{DLDeviceType::kDLOneAPI, "oneapi"},
      std::pair{DLDeviceType::kDLWebGPU, "webgpu"},
      std::pair{DLDeviceType::kDLHexagon, "hexagon"},
      std::pair{DLDeviceType::kDLMAIA, "maia"},
      std::pair{DLDeviceType::kDLTrn, "trn"},
  };
  constexpr auto max_type = stdr::max(map | stdv::keys);
  auto result = std::array<std::string_view, max_type + 1>{};
  for (const auto& [code, name] : map) {
    result[static_cast<std::size_t>(code)] = name;
  }
  return result;
}();

// Wrapper so a DLDevice can be streamed unambiguously from other namespaces.
struct PrintableDevice {
  DLDevice device;
};

// Prints e.g. "cuda[0]"; the device id is omitted when it is the wildcard.
// Fails a RuntimeCheck on device types missing from kDeviceStringMap.
inline auto& operator<<(std::ostream& os, DLDevice device) {
  const auto& mapping = kDeviceStringMap;
  const auto entry = static_cast<std::size_t>(device.device_type);
  host::RuntimeCheck(entry < mapping.size());
  const auto name = mapping[entry];
  host::RuntimeCheck(!name.empty(), "Unknown device: ", int(device.device_type));
  os << name;
  if (device.device_id != kAnyDeviceID) os << "[" << device.device_id << "]";
  return os;
}

inline auto& operator<<(std::ostream& os, PrintableDevice pd) {
  return os << pd.device;
}

// Prints a span as "[a, b, c]".
template <typename T>
inline auto& operator<<(std::ostream& os, PrintAbleSpan<T> span) {
  os << "[";
  for (const auto i : stdv::iota(std::size_t{0}, span.data.size())) {
    if (i > 0) {
      os << ", ";
    }
    os << span.data[i];
  }
  os << "]";
  return os;
}

} // namespace details
132
+
133
// A write-once symbolic dimension. Starts unbound; the first verify() binds
// it to the observed extent, later verify() calls must match it. Non-copyable
// so that all users share one binding.
struct SymbolicSize {
 public:
  // `annotation` is only used for diagnostics (e.g. "D", "length").
  SymbolicSize(std::string_view annotation = {}) : m_value(details::kNullSize), m_annotation(annotation) {}

  auto get_name() const -> std::string_view {
    return m_annotation;
  }
  // Binds the size; fails if already bound.
  auto set_value(int64_t value) -> void {
    host::RuntimeCheck(!this->has_value(), "Size value already set");
    m_value = value;
  }
  auto has_value() const -> bool {
    return m_value != details::kNullSize;
  }
  auto get_value() const -> std::optional<int64_t> {
    return this->has_value() ? std::optional{m_value} : std::nullopt;
  }
  // Returns the bound size; fails if still unbound.
  auto unwrap() const -> int64_t {
    host::RuntimeCheck(this->has_value(), "Size value is not set");
    return m_value;
  }

  SymbolicSize(const SymbolicSize&) = delete;
  SymbolicSize& operator=(const SymbolicSize&) = delete;

  // First call binds `dim`; subsequent calls require equality.
  auto verify(int64_t dim) -> void {
    if (this->has_value()) {
      host::RuntimeCheck(m_value == dim, "Size mismatch: expected ", m_value, " but got ", dim);
    } else {
      this->set_value(dim);
    }
  }

 private:
  std::int64_t m_value;         // kNullSize (-1) while unbound
  std::string_view m_annotation;  // diagnostic label, not owned
};
170
+
171
// Exact equality on both device type and device id (no wildcard handling;
// wildcard-aware matching lives in SymbolicDevice::m_check).
inline auto operator==(DLDevice lhs, DLDevice rhs) -> bool {
  return lhs.device_type == rhs.device_type && lhs.device_id == rhs.device_id;
}
174
+
175
// A write-once symbolic dtype, optionally restricted to a set of allowed
// DLDataType options. First verify() binds; later ones must match exactly.
struct SymbolicDType {
 public:
  SymbolicDType() : m_value({details::kNullDType, 0, 0}) {}

  // Binds the dtype; fails if already bound or outside the allowed options.
  auto set_value(DLDataType value) -> void {
    host::RuntimeCheck(!this->has_value(), "Dtype value already set");
    host::RuntimeCheck(
        m_check(value), "Dtype value [", value, "] not in the allowed options: ", details::PrintAbleSpan{m_options});
    m_value = value;
  }
  auto has_value() const -> bool {
    return m_value.code != details::kNullDType;
  }
  auto get_value() const -> std::optional<DLDataType> {
    return this->has_value() ? std::optional{m_value} : std::nullopt;
  }
  // Returns the bound dtype; fails if still unbound.
  auto unwrap() const -> DLDataType {
    host::RuntimeCheck(this->has_value(), "Dtype value is not set");
    return m_value;
  }

  // Restricts acceptable dtypes. The span is stored by reference — the
  // caller's storage must outlive this object.
  auto set_options(std::span<const DLDataType> options) -> void {
    m_options = options;
  }
  // Convenience overload building the option list from C++ types.
  template <typename... Ts>
  auto set_options() -> void {
    m_options = details::kDTypeList<Ts...>;
  }

  // First call binds `dtype`; subsequent calls require equality.
  auto verify(DLDataType dtype) -> void {
    if (this->has_value()) {
      host::RuntimeCheck(m_value == dtype, "DType mismatch: expected ", m_value, " but got ", dtype);
    } else {
      this->set_value(dtype);
    }
  }

 private:
  // Empty option list means "anything goes".
  auto m_check(DLDataType value) const -> bool {
    return stdr::empty(m_options) || (stdr::find(m_options, value) != stdr::end(m_options));
  }

  std::span<const DLDataType> m_options;  // allowed dtypes (non-owning)
  DLDataType m_value;                     // code == kNullDType while unbound
};
220
+
221
// A write-once symbolic device, optionally restricted to allowed DLDevice
// options. Options may wildcard the device id via kAnyDeviceID; the device
// type must always match exactly.
struct SymbolicDevice {
 public:
  SymbolicDevice() : m_value({details::kNullDevice, details::kAnyDeviceID}) {}

  // Binds the device; fails if already bound or not among the options.
  auto set_value(DLDevice value) -> void {
    host::RuntimeCheck(!this->has_value(), "Device value already set");
    host::RuntimeCheck(
        m_check(value),
        "Device value [",
        details::PrintableDevice{value},
        "] not in the allowed options: ",
        details::PrintAbleSpan{m_options});
    m_value = value;
  }
  auto has_value() const -> bool {
    return m_value.device_type != details::kNullDevice;
  }
  auto get_value() const -> std::optional<DLDevice> {
    return this->has_value() ? std::optional{m_value} : std::nullopt;
  }
  // Returns the bound device; fails if still unbound.
  auto unwrap() const -> DLDevice {
    host::RuntimeCheck(this->has_value(), "Device value is not set");
    return m_value;
  }

  // Restricts acceptable devices. The span is stored by reference — the
  // caller's storage must outlive this object.
  auto set_options(std::span<const DLDevice> options) -> void {
    m_options = options;
  }
  // Convenience overload: listed device types with wildcard device ids.
  template <DLDeviceType... Codes>
  auto set_options() -> void {
    m_options = details::kDeviceList<Codes...>;
  }

  // First call binds `device`; subsequent calls require exact equality
  // (type and id) — i.e. all verified tensors share one device.
  auto verify(DLDevice device) -> void {
    if (this->has_value()) {
      host::RuntimeCheck(
          m_value == device,
          "Device mismatch: expected ",
          details::PrintableDevice{m_value},
          " but got ",
          details::PrintableDevice{device});
    } else {
      this->set_value(device);
    }
  }

 private:
  auto m_check(DLDevice value) const -> bool {
    return stdr::empty(m_options) || (stdr::any_of(m_options, [value](const DLDevice& opt) {
             // device type must exactly match
             if (opt.device_type != value.device_type) return false;
             // device id can be wildcarded
             return opt.device_id == details::kAnyDeviceID || opt.device_id == value.device_id;
           }));
  }

  std::span<const DLDevice> m_options;  // allowed devices (non-owning)
  DLDevice m_value;                     // device_type == kNullDevice while unbound
};
280
+
281
+ namespace details {
282
+
283
+ template <typename T>
284
+ struct BaseRef {
285
+ public:
286
+ BaseRef(const BaseRef&) = delete;
287
+ BaseRef& operator=(const BaseRef&) = delete;
288
+
289
+ auto operator->() const -> T* {
290
+ return m_ref;
291
+ }
292
+ auto operator*() const -> T& {
293
+ return *m_ref;
294
+ }
295
+ auto rebind(T& other) -> void {
296
+ m_ref = &other;
297
+ }
298
+
299
+ explicit BaseRef() : m_ref(&m_cache), m_cache() {}
300
+ BaseRef(T& size) : m_ref(&size), m_cache() {}
301
+
302
+ private:
303
+ T* m_ref;
304
+ T m_cache;
305
+ };
306
+
307
// Reference to a SymbolicSize, also constructible from an int64_t literal.
// kAnySize leaves the dimension unconstrained.
struct SizeRef : BaseRef<SymbolicSize> {
  using BaseRef::BaseRef;
  SizeRef(int64_t value) {
    if (value != kAnySize) {
      (**this).set_value(value);
    } else {
      // otherwise, we can match any size
    }
  }

  // Human-readable label for dimension `dim`: the pinned value if set, else the
  // symbol's annotation name, else a positional "dim#<i>" placeholder.
  auto value_or_name(std::size_t dim) const -> std::string {
    if (const auto value = (**this).get_value()) {
      return std::to_string(*value);
    } else {
      const auto annotation = (**this).get_name();
      if (annotation.empty()) {
        return "dim#" + std::to_string(dim);
      } else {
        return static_cast<std::string>(annotation);
      }
    }
  }
};
330
+
331
// Reference to a SymbolicDType; constructible from a single DLDataType (pins
// the value) or from a list/span of acceptable options.
struct DTypeRef : BaseRef<SymbolicDType> {
  using BaseRef::BaseRef;
  DTypeRef(DLDataType options) {
    (**this).set_value(options);
  }
  DTypeRef(std::initializer_list<DLDataType> options) {
    (**this).set_options(options);
  }
  DTypeRef(std::span<const DLDataType> options) {
    (**this).set_options(options);
  }
};
343
+
344
// Reference to a SymbolicDevice; constructible from a single DLDevice (pins the
// value) or from a list/span of acceptable device options.
struct DeviceRef : BaseRef<SymbolicDevice> {
  using BaseRef::BaseRef;
  DeviceRef(DLDevice options) {
    (**this).set_value(options);
  }
  DeviceRef(std::initializer_list<DLDevice> options) {
    (**this).set_options(options);
  }
  DeviceRef(std::span<const DLDevice> options) {
    (**this).set_options(options);
  }
};
356
+
357
+ } // namespace details
358
+
359
// Fluent validator for tvm::ffi::TensorView, intended to be built and consumed
// within a single full expression, e.g.:
//   TensorMatcher({rows, kAnySize}).with_dtype<...>().with_device<...>().verify(t);
// NOTE(review): m_shape/m_strides are std::spans over the caller's initializer
// lists, and with_dtype/with_device may rebind to temporaries — this is only
// safe while those temporaries live, i.e. until the end of the full expression.
// Do not store a TensorMatcher; confirm no caller does.
struct TensorMatcher {
private:
  using SizeRef = details::SizeRef;
  using DTypeRef = details::DTypeRef;
  using DeviceRef = details::DeviceRef;
  using Loc_t = std::source_location;

public:
  TensorMatcher(const TensorMatcher&) = delete;
  TensorMatcher& operator=(const TensorMatcher&) = delete;

  // Shape spec: each entry is a literal, kAnySize, or a named SymbolicSize ref.
  explicit TensorMatcher(std::initializer_list<SizeRef> shape) : m_shape(shape), m_strides(), m_dtype() {}

  // Optional stride spec; without it, verify() requires a contiguous tensor.
  auto with_strides(std::initializer_list<SizeRef> strides) && -> TensorMatcher&& {
    // no partial update allowed
    host::RuntimeCheck(m_strides.size() == 0, "Strides already specified");
    host::RuntimeCheck(m_shape.size() == strides.size(), "Strides size must match shape size");
    m_strides = strides;
    return std::move(*this);
  }

  // Bind the dtype slot to a caller-provided (possibly temporary) DTypeRef.
  // NOTE(review): the unused `typename... Ts` pack looks like copy-paste from
  // the nullary overload below — confirm and remove.
  template <typename... Ts>
  auto with_dtype(DTypeRef&& dtype) && -> TensorMatcher&& {
    m_init_dtype();
    m_dtype.rebind(*dtype);
    return std::move(*this);
  }

  // Restrict dtype to a compile-time list of options.
  template <typename... Ts>
  auto with_dtype() && -> TensorMatcher&& {
    static_assert(sizeof...(Ts) > 0, "At least one dtype option must be specified");
    m_init_dtype();
    m_dtype->set_options<Ts...>();
    return std::move(*this);
  }

  // Bind the device slot to a caller-provided (possibly temporary) DeviceRef.
  // NOTE(review): the `DLDeviceType... Codes` pack is unused here as well.
  template <DLDeviceType... Codes>
  auto with_device(DeviceRef&& device) && -> TensorMatcher&& {
    m_init_device();
    m_device.rebind(*device);
    return std::move(*this);
  }

  // Restrict device to a compile-time list of device-type codes.
  template <DLDeviceType... Codes>
  auto with_device() && -> TensorMatcher&& {
    static_assert(sizeof...(Codes) > 0, "At least one device option must be specified");
    m_init_device();
    m_device->set_options<Codes...>();
    return std::move(*this);
  }

  // once we start verification, we cannot modify anymore
  // On mismatch, rethrows PanicError with the matcher description and the
  // caller's source location prepended to the root cause.
  auto verify(tvm::ffi::TensorView view, Loc_t loc = Loc_t::current()) const&& -> const TensorMatcher&& {
    try {
      this->m_verify_impl(view);
    } catch (PanicError& e) {
      auto oss = std::ostringstream{};
      oss << "Tensor match failed for " << this->debug_str() << " at " << loc.file_name() << ":" << loc.line()
          << "\n- Root cause: " << e.detail();
      throw PanicError(std::move(oss).str());
    }
    return std::move(*this);
  }

  // Render the expected shape (and strides, if specified) as e.g.
  // "Tensor<128, dim#1> [strides=<...>]".
  auto debug_str() const -> std::string {
    auto oss = std::ostringstream{};
    oss << "Tensor<";
    std::size_t dim = 0;
    for (const auto& size_ref : m_shape) {
      if (dim > 0) {
        oss << ", ";
      }
      oss << size_ref.value_or_name(dim++);
    }
    oss << ">";
    if (m_strides.size() > 0) {
      oss << " [strides=<";
      dim = 0;
      for (const auto& stride_ref : m_strides) {
        if (dim > 0) {
          oss << ", ";
        }
        oss << stride_ref.value_or_name(dim++);
      }
      oss << ">]";
    }
    return std::move(oss).str();
  }

private:
  // Checks rank, per-dim sizes, strides (or contiguity), dtype and device.
  // Verifying also *pins* previously-unset symbolic values via verify().
  auto m_verify_impl(tvm::ffi::TensorView view) const -> void {
    const auto dim = static_cast<std::size_t>(view.dim());
    host::RuntimeCheck(dim == m_shape.size(), "Tensor dimension mismatch: expected ", m_shape.size(), " but got ", dim);
    for (const auto i : stdv::iota(std::size_t{0}, dim)) {
      m_shape[i]->verify(view.size(i));
    }
    if (this->m_has_strides()) {
      for (const auto i : stdv::iota(std::size_t{0}, dim)) {
        m_strides[i]->verify(view.stride(i));
      }
    } else {
      host::RuntimeCheck(view.is_contiguous(), "Tensor is not contiguous as expected");
    }
    // since we may use the same matcher to verify again, we will force to check
    m_dtype->verify(view.dtype());
    m_device->verify(view.device());
  }

  // One-shot guards: dtype/device may each be specified at most once.
  auto m_init_dtype() -> void {
    host::RuntimeCheck(!m_has_dtype, "DType already specified");
    m_has_dtype = true;
  }
  auto m_init_device() -> void {
    host::RuntimeCheck(!m_has_device, "Device already specified");
    m_has_device = true;
  }
  auto m_has_strides() const -> bool {
    return !m_strides.empty();
  }

  std::span<const SizeRef> m_shape;    // view over caller's initializer list
  std::span<const SizeRef> m_strides;  // empty => require contiguous
  DTypeRef m_dtype;
  DeviceRef m_device;
  bool m_has_dtype = false;
  bool m_has_device = false;
};
486
+
487
+ } // namespace host
sglang/jit_kernel/include/sgl_kernel/utils.cuh ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include <sgl_kernel/utils.h>
4
+
5
+ #include <dlpack/dlpack.h>
6
+ #include <tvm/ffi/extra/c_env_api.h>
7
+
8
+ #include <concepts>
9
+ #include <cstddef>
10
+ #include <source_location>
11
+ #include <type_traits>
12
+
13
+ namespace device {
14
+
15
+ inline constexpr auto kWarpThreads = 32u;
16
+
17
+ namespace pointer {
18
+
19
+ // we only allow void * pointer arithmetic for safety
20
+
21
// Byte-wise pointer offset; T is constrained to void so typed arithmetic
// cannot slip in. All offsets are folded into a single byte displacement.
template <typename T, std::integral... U>
__always_inline __device__ auto offset(T* ptr, U... offset) -> void* {
  static_assert(std::is_same_v<T, void>, "Pointer arithmetic is only allowed for void* pointers");
  return static_cast<char*>(ptr) + (... + offset);
}

// const overload of the byte-wise offset above.
template <typename T, std::integral... U>
__always_inline __device__ auto offset(const T* ptr, U... offset) -> const void* {
  static_assert(std::is_same_v<T, void>, "Pointer arithmetic is only allowed for void* pointers");
  return static_cast<const char*>(ptr) + (... + offset);
}
32
+
33
+ } // namespace pointer
34
+
35
+ } // namespace device
36
+
37
+ namespace host {
38
+
39
// Throw host::PanicError (tagged with the call site) if a CUDA API call failed.
inline auto
RuntimeDeviceCheck(::cudaError_t error, std::source_location location = std::source_location::current()) -> void {
  if (error != ::cudaSuccess) {
    [[unlikely]];
    ::host::panic(location, "CUDA error: ", ::cudaGetErrorString(error));
  }
}

// Check the sticky last-error state (e.g. right after a kernel launch).
inline auto RuntimeCudaCheck(std::source_location location = std::source_location::current()) -> void {
  return RuntimeDeviceCheck(::cudaGetLastError(), location);
}
50
+
51
// Raise kernel F's dynamic shared-memory attribute exactly once per template
// instantiation. The static local captures the *first* smem_size passed in;
// subsequent calls may only request <= that size (enforced below), since the
// attribute is never re-set.
template <auto F>
inline void set_smem_once(std::size_t smem_size) {
  static const auto last_smem_size = [&] {
    RuntimeDeviceCheck(::cudaFuncSetAttribute(F, ::cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size));
    return smem_size;
  }();
  RuntimeCheck(
      smem_size <= last_smem_size,
      "Dynamic shared memory size exceeds the previously set maximum size: ",
      last_smem_size,
      " bytes");
}
63
+
64
// Thin, non-copyable wrapper around cudaLaunchKernelEx: capture the launch
// configuration once, then launch via operator()(kernel, args...).
struct LaunchKernel {
public:
  // Resolve the stream from the DLPack device via the TVM-FFI environment.
  explicit LaunchKernel(
      dim3 grid_dim, dim3 block_dim, DLDevice device, std::size_t dynamic_shared_mem_bytes = 0) noexcept
      : m_config(s_make_config(grid_dim, block_dim, resolve_device(device), dynamic_shared_mem_bytes)) {}

  // Use an explicit CUDA stream.
  explicit LaunchKernel(
      dim3 grid_dim, dim3 block_dim, cudaStream_t stream, std::size_t dynamic_shared_mem_bytes = 0) noexcept
      : m_config(s_make_config(grid_dim, block_dim, stream, dynamic_shared_mem_bytes)) {}

  // Map a DLDevice to the current stream registered with the TVM-FFI env.
  static auto resolve_device(DLDevice device) -> cudaStream_t {
    return static_cast<cudaStream_t>(::TVMFFIEnvGetStream(device.device_type, device.device_id));
  }

  LaunchKernel(const LaunchKernel&) = delete;
  LaunchKernel& operator=(const LaunchKernel&) = delete;

  // Launch `kernel` with the stored config; throws PanicError on launch failure.
  template <typename T, typename... Args>
  auto operator()(T&& kernel, Args&&... args) const -> void {
    host::RuntimeDeviceCheck(::cudaLaunchKernelEx(&m_config, kernel, std::forward<Args>(args)...));
  }

private:
  static auto
  s_make_config(dim3 grid_dim, dim3 block_dim, cudaStream_t stream, std::size_t smem) -> cudaLaunchConfig_t {
    auto config = ::cudaLaunchConfig_t{};
    config.gridDim = grid_dim;
    config.blockDim = block_dim;
    config.dynamicSmemBytes = smem;
    config.stream = stream;
    config.numAttrs = 0;  // no extended launch attributes
    return config;
  }
  cudaLaunchConfig_t m_config;
  /// TODO: We can add a queue to store the attributes if needed in the future.
};
100
+
101
+ } // namespace host
sglang/jit_kernel/include/sgl_kernel/utils.h ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include <dlpack/dlpack.h>
4
+
5
+ #include <concepts>
6
+ #include <ostream>
7
+ #include <source_location>
8
+ #include <sstream>
9
+ #include <utility>
10
+
11
+ namespace host {
12
+
13
// Exception thrown by host::panic. Stores the full message; detail() strips the
// "Runtime check failed at <file>:<line>: " prefix added by panic() to expose
// only the caller-supplied part.
struct PanicError : public std::runtime_error {
public:
  // copy and move constructors
  explicit PanicError(std::string msg) : runtime_error(msg), m_message(std::move(msg)) {}
  auto detail() const -> std::string_view {
    const auto sv = std::string_view{m_message};
    // everything after the first ": " is the user message (see host::panic)
    const auto pos = sv.find(": ");
    return pos == std::string_view::npos ? sv : sv.substr(pos + 2);
  }

private:
  // NOTE(review): duplicates the text already held by std::runtime_error
  // (what()); kept to guarantee a stable string_view source for detail().
  std::string m_message;
};
26
+
27
// Format "Runtime check failed at <file>:<line>[: <args...>]" and throw it as
// PanicError. With no message args, the enclosing function name is appended
// instead. Never returns.
template <typename... Args>
[[noreturn]]
inline auto panic(std::source_location location, Args&&... args) -> void {
  std::ostringstream os;
  os << "Runtime check failed at " << location.file_name() << ":" << location.line();
  if constexpr (sizeof...(args) > 0) {
    os << ": ";
    // stream every message fragment in order via a fold expression
    (os << ... << std::forward<Args>(args));
  } else {
    os << " in " << location.function_name();
  }
  throw PanicError(std::move(os).str());
}
40
+
41
// Assertion helper used as `RuntimeCheck(cond, "msg", parts...)`.
// It is a class template (not a function) so that the defaulted
// std::source_location parameter can come *after* the variadic message
// arguments; the deduction guide below makes the call-site syntax work.
template <typename... Args>
struct RuntimeCheck {
  using Loc_t = std::source_location;
  template <typename Cond>
  explicit RuntimeCheck(Cond&& condition, Args&&... args, Loc_t location = Loc_t::current()) {
    if (!condition) {
      [[unlikely]];
      ::host::panic(location, std::forward<Args>(args)...);
    }
  }
};

// Deduce Args... from the trailing message arguments at the call site.
template <typename Cond, typename... Args>
explicit RuntimeCheck(Cond&&, Args&&...) -> RuntimeCheck<Args...>;
55
+
56
// Ceiling division for signed integers.
// NOTE(review): the (a + b - 1) / b formula is only correct for a >= 0, b > 0;
// truncation toward zero gives a wrong result for negative `a` — confirm
// callers only pass non-negative sizes.
template <std::signed_integral T, std::signed_integral U>
inline constexpr auto div_ceil(T a, U b) {
  return (a + b - 1) / b;
}

// Ceiling division for unsigned integers; keeping separate signed/unsigned
// overloads prevents accidental signed/unsigned mixing at the call site.
template <std::unsigned_integral T, std::unsigned_integral U>
inline constexpr auto div_ceil(T a, U b) {
  return (a + b - 1) / b;
}
65
+
66
// Bytes per element of a DLPack dtype.
// NOTE(review): ignores dtype.lanes and truncates sub-byte widths (bits < 8
// yields 0) — confirm only single-lane, byte-aligned dtypes reach this.
inline auto dtype_bytes(DLDataType dtype) -> std::size_t {
  return static_cast<std::size_t>(dtype.bits / 8);
}
69
+
70
+ namespace pointer {
71
+
72
+ // we only allow void * pointer arithmetic for safety
73
+
74
// Byte-wise pointer offset (host mirror of device::pointer::offset);
// T is constrained to void so typed arithmetic cannot slip in.
template <typename T, std::integral... U>
inline auto offset(T* ptr, U... offset) -> void* {
  static_assert(std::is_same_v<T, void>, "Pointer arithmetic is only allowed for void* pointers");
  // fold all offsets into one byte displacement
  return static_cast<char*>(ptr) + (... + offset);
}

// const overload of the byte-wise offset above.
template <typename T, std::integral... U>
inline auto offset(const T* ptr, U... offset) -> const void* {
  static_assert(std::is_same_v<T, void>, "Pointer arithmetic is only allowed for void* pointers");
  return static_cast<const char*>(ptr) + (... + offset);
}
85
+
86
+ } // namespace pointer
87
+
88
+ } // namespace host
sglang/jit_kernel/include/sgl_kernel/warp.cuh ADDED
@@ -0,0 +1,145 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+ #include <sgl_kernel/utils.cuh>
3
+
4
+ #include <cstddef>
5
+ #include <cstdint>
6
+ #include <type_traits>
7
+
8
+ namespace device::warp {
9
+
10
+ namespace details {
11
+
12
// Select the CUDA vector type used for a kUnit-byte memory transaction.
template <std::size_t kUnit>
inline constexpr auto get_mem_package() {
  if constexpr (kUnit == 16) {
    return uint4{};
  } else if constexpr (kUnit == 8) {
    return uint2{};
  } else if constexpr (kUnit == 4) {
    return uint1{};
  } else {
    static_assert(kUnit == 16 || kUnit == 8 || kUnit == 4, "Unsupported memory package size");
  }
}

// Largest unit (16/8/4 bytes) such that a full warp's worth of packages
// divides x evenly; 0 if none fits.
inline constexpr auto default_unit_size(std::size_t x) -> std::size_t {
  if (x % (16 * kWarpThreads) == 0) return 16;
  if (x % (8 * kWarpThreads) == 0) return 8;
  if (x % (4 * kWarpThreads) == 0) return 4;
  return 0;  // trigger static assert in get_mem_package
}

// Package type used to move kBytes in kUnit-byte chunks.
template <std::size_t kBytes, std::size_t kUnit>
using mem_package_t = decltype(get_mem_package<kUnit>());

// Per-thread register storage produced by load_vec / consumed by store_vec.
template <typename T, std::size_t N>
struct storage_vec {
  T data[N];
};
39
+
40
// Vectorized global loads/stores with the .cs (cache-streaming, evict-first)
// PTX qualifier, for 4/8/16-byte packages.
// NOTE(review): despite the "_nc" names these emit ld/st.global.cs, not
// ld.global.nc — confirm which cache policy is actually intended.

__always_inline __device__ auto load_nc(const uint1* __restrict__ src) -> uint1 {
  uint32_t tmp;
  asm volatile("ld.global.cs.b32 %0,[%1];" : "=r"(tmp) : "l"(src));
  return uint1{tmp};
}

__always_inline __device__ auto load_nc(const uint2* __restrict__ src) -> uint2 {
  uint32_t tmp0, tmp1;
  asm volatile("ld.global.cs.v2.b32 {%0,%1},[%2];" : "=r"(tmp0), "=r"(tmp1) : "l"(src));
  return uint2{tmp0, tmp1};
}

__always_inline __device__ auto load_nc(const uint4* __restrict__ src) -> uint4 {
  uint32_t tmp0, tmp1, tmp2, tmp3;
  asm volatile("ld.global.cs.v4.b32 {%0,%1,%2,%3},[%4];" : "=r"(tmp0), "=r"(tmp1), "=r"(tmp2), "=r"(tmp3) : "l"(src));
  return uint4{tmp0, tmp1, tmp2, tmp3};
}

__always_inline __device__ void store_nc(uint1* __restrict__ dst, const uint1& value) {
  uint32_t tmp = value.x;
  asm volatile("st.global.cs.b32 [%0],%1;" ::"l"(dst), "r"(tmp));
}

__always_inline __device__ void store_nc(uint2* __restrict__ dst, const uint2& value) {
  uint32_t tmp0 = value.x;
  uint32_t tmp1 = value.y;
  asm volatile("st.global.cs.v2.b32 [%0],{%1,%2};" ::"l"(dst), "r"(tmp0), "r"(tmp1));
}

__always_inline __device__ void store_nc(uint4* __restrict__ dst, const uint4& value) {
  uint32_t tmp0 = value.x;
  uint32_t tmp1 = value.y;
  uint32_t tmp2 = value.z;
  uint32_t tmp3 = value.w;
  asm volatile("st.global.cs.v4.b32 [%0],{%1,%2,%3,%4};" ::"l"(dst), "r"(tmp0), "r"(tmp1), "r"(tmp2), "r"(tmp3));
}
76
+
77
+ } // namespace details
78
+
79
+ template <
80
+ std::size_t kBytes,
81
+ std::size_t kUnit = details::default_unit_size(kBytes),
82
+ std::size_t kThreads = ::device::kWarpThreads>
83
+ __always_inline __device__ void copy(void* __restrict__ dst, const void* __restrict__ src) {
84
+ using Package = details::mem_package_t<kBytes, kUnit>;
85
+ constexpr auto kBytesPerLoop = sizeof(Package) * kThreads;
86
+ constexpr auto kLoopCount = kBytes / kBytesPerLoop;
87
+ static_assert(kBytes % kBytesPerLoop == 0, "kBytes must be multiple of 128 bytes");
88
+
89
+ const auto dst_packed = static_cast<Package*>(dst);
90
+ const auto src_packed = static_cast<const Package*>(src);
91
+ const auto lane_id = threadIdx.x % kThreads;
92
+
93
+ #pragma unroll kLoopCount
94
+ for (std::size_t i = 0; i < kLoopCount; ++i) {
95
+ const auto j = i * kThreads + lane_id;
96
+ dst_packed[j] = src_packed[j];
97
+ }
98
+ }
99
+
100
+ template <
101
+ std::size_t kBytes,
102
+ std::size_t kUnit = details::default_unit_size(kBytes),
103
+ std::size_t kThreads = ::device::kWarpThreads>
104
+ __always_inline __device__ auto load_vec(const void* __restrict__ src) {
105
+ using Package = details::mem_package_t<kBytes, kUnit>;
106
+ constexpr auto kBytesPerLoop = sizeof(Package) * kThreads;
107
+ constexpr auto kLoopCount = kBytes / kBytesPerLoop;
108
+ static_assert(kBytes % kBytesPerLoop == 0, "kBytes must be multiple of 128 bytes");
109
+
110
+ const auto src_packed = static_cast<const Package*>(src);
111
+ const auto lane_id = threadIdx.x % kThreads;
112
+ details::storage_vec<Package, kLoopCount> vec;
113
+
114
+ #pragma unroll kLoopCount
115
+ for (std::size_t i = 0; i < kLoopCount; ++i) {
116
+ const auto j = i * kThreads + lane_id;
117
+ vec.data[i] = details::load_nc(src_packed + j);
118
+ }
119
+
120
+ return vec;
121
+ }
122
+
123
+ template <
124
+ std::size_t kBytes,
125
+ std::size_t kUnit = details::default_unit_size(kBytes),
126
+ std::size_t kThreads = ::device::kWarpThreads,
127
+ typename Tp>
128
+ __always_inline __device__ void store_vec(void* __restrict__ dst, const Tp& vec) {
129
+ using Package = details::mem_package_t<kBytes, kUnit>;
130
+ constexpr auto kBytesPerLoop = sizeof(Package) * kThreads;
131
+ constexpr auto kLoopCount = kBytes / kBytesPerLoop;
132
+ static_assert(kBytes % kBytesPerLoop == 0, "kBytes must be multiple of 128 bytes");
133
+ static_assert(std::is_same_v<Tp, details::storage_vec<Package, kLoopCount>>);
134
+
135
+ const auto dst_packed = static_cast<Package*>(dst);
136
+ const auto lane_id = threadIdx.x % kThreads;
137
+
138
+ #pragma unroll kLoopCount
139
+ for (std::size_t i = 0; i < kLoopCount; ++i) {
140
+ const auto j = i * kThreads + lane_id;
141
+ details::store_nc(dst_packed + j, vec.data[i]);
142
+ }
143
+ }
144
+
145
+ } // namespace device::warp
sglang/jit_kernel/utils.py ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import pathlib
4
+ from functools import lru_cache
5
+ from typing import TYPE_CHECKING, List, Tuple, TypeAlias, Union
6
+
7
+ if TYPE_CHECKING:
8
+ from tvm_ffi import Module
9
+
10
+
11
+ def _make_wrapper(tup: Tuple[str, str]) -> str:
12
+ export_name, kernel_name = tup
13
+ return f"TVM_FFI_DLL_EXPORT_TYPED_FUNC({export_name}, ({kernel_name}));"
14
+
15
+
16
@lru_cache()
def _resolve_kernel_path() -> pathlib.Path:
    """Locate the sgl-kernel JIT root: the directory containing include/ and csrc/.

    Cached for the process lifetime via lru_cache.

    Raises:
        RuntimeError: if neither lookup strategy finds the sources.
    """
    cur_dir = pathlib.Path(__file__).parent.resolve()

    # first, try this directory structure
    def _environment_install():
        # In-tree layout: include/ and csrc/ live next to this file.
        candidate = cur_dir.resolve()
        if (candidate / "include").exists() and (candidate / "csrc").exists():
            return candidate
        return None

    def _package_install():
        # TODO: support find path by package
        return None

    path = _environment_install() or _package_install()
    if path is None:
        raise RuntimeError("Cannot find sgl-kernel/jit path")
    return path
35
+
36
+
37
# Resolved once at import time; raises RuntimeError if the sources are missing.
KERNEL_PATH = _resolve_kernel_path()
DEFAULT_INCLUDE = [str(KERNEL_PATH / "include")]
DEFAULT_CFLAGS = ["-std=c++20", "-O3"]
DEFAULT_CUDA_CFLAGS = ["-std=c++20", "-O3", "--expt-relaxed-constexpr"]
DEFAULT_LDFLAGS = []
# Python value types that can be rendered as C++ template arguments.
CPP_TEMPLATE_TYPE: TypeAlias = Union[int, float, bool]
43
+
44
+
45
+ class CPPArgList(list[str]):
46
+ def __str__(self) -> str:
47
+ return ", ".join(self)
48
+
49
+
50
+ def make_cpp_args(*args: CPP_TEMPLATE_TYPE) -> CPPArgList:
51
+ def _convert(arg: CPP_TEMPLATE_TYPE) -> str:
52
+ if isinstance(arg, bool):
53
+ return "true" if arg else "false"
54
+ if isinstance(arg, (int, float)):
55
+ return str(arg)
56
+ raise TypeError(f"Unsupported argument type for cpp template: {type(arg)}")
57
+
58
+ return CPPArgList(_convert(arg) for arg in args)
59
+
60
+
61
def load_jit(
    *args: str,
    cpp_files: List[str] | None = None,
    cuda_files: List[str] | None = None,
    cpp_wrappers: List[Tuple[str, str]] | None = None,
    cuda_wrappers: List[Tuple[str, str]] | None = None,
    extra_cflags: List[str] | None = None,
    extra_cuda_cflags: List[str] | None = None,
    extra_ldflags: List[str] | None = None,
    extra_include_paths: List[str] | None = None,
    build_directory: str | None = None,
) -> Module:
    """Compile and load an sgl-kernel JIT module via tvm_ffi's load_inline.

    Args:
        *args: name components; joined into ``sgl_kernel_jit_<a>_<b>_...``.
        cpp_files / cuda_files: file names under ``csrc/`` to #include into the
            generated C++ / CUDA translation units.
        cpp_wrappers / cuda_wrappers: (export_name, kernel_name) pairs rendered
            as TVM_FFI_DLL_EXPORT_TYPED_FUNC lines (see _make_wrapper).
        extra_*: appended to the corresponding DEFAULT_* flag/path lists.
        build_directory: forwarded to load_inline; None lets tvm_ffi choose.

    Returns:
        The compiled-and-loaded tvm_ffi Module.
    """
    from tvm_ffi.cpp import load_inline

    cpp_files = cpp_files or []
    cuda_files = cuda_files or []
    cpp_wrappers = cpp_wrappers or []
    cuda_wrappers = cuda_wrappers or []
    extra_cflags = extra_cflags or []
    extra_cuda_cflags = extra_cuda_cflags or []
    extra_ldflags = extra_ldflags or []
    extra_include_paths = extra_include_paths or []

    # include cpp files
    cpp_paths = [(KERNEL_PATH / "csrc" / f).resolve() for f in cpp_files]
    cpp_sources = [f'#include "{path}"' for path in cpp_paths]
    cpp_sources += [_make_wrapper(tup) for tup in cpp_wrappers]

    # include cuda files
    cuda_paths = [(KERNEL_PATH / "csrc" / f).resolve() for f in cuda_files]
    cuda_sources = [f'#include "{path}"' for path in cuda_paths]
    cuda_sources += [_make_wrapper(tup) for tup in cuda_wrappers]

    return load_inline(
        "sgl_kernel_jit_" + "_".join(str(arg) for arg in args),
        cpp_sources=cpp_sources,
        cuda_sources=cuda_sources,
        extra_cflags=DEFAULT_CFLAGS + extra_cflags,
        extra_cuda_cflags=DEFAULT_CUDA_CFLAGS + extra_cuda_cflags,
        extra_ldflags=DEFAULT_LDFLAGS + extra_ldflags,
        extra_include_paths=DEFAULT_INCLUDE + extra_include_paths,
        build_directory=build_directory,
    )
sglang/lang/__pycache__/api.cpython-311.pyc ADDED
Binary file (10.3 kB). View file
 
sglang/lang/__pycache__/chat_template.cpython-311.pyc ADDED
Binary file (19.4 kB). View file
 
sglang/lang/__pycache__/choices.cpython-311.pyc ADDED
Binary file (9.41 kB). View file
 
sglang/lang/__pycache__/interpreter.cpython-311.pyc ADDED
Binary file (50 kB). View file
 
sglang/lang/__pycache__/ir.cpython-311.pyc ADDED
Binary file (33.4 kB). View file
 
sglang/lang/api.py ADDED
@@ -0,0 +1,292 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Public APIs of the language."""
2
+
3
+ import re
4
+ from typing import Callable, List, Optional, Union
5
+
6
+ from sglang.global_config import global_config
7
+ from sglang.lang.backend.base_backend import BaseBackend
8
+ from sglang.lang.choices import ChoicesSamplingMethod, token_length_normalized
9
+ from sglang.lang.ir import (
10
+ SglExpr,
11
+ SglExprList,
12
+ SglFunction,
13
+ SglGen,
14
+ SglImage,
15
+ SglRoleBegin,
16
+ SglRoleEnd,
17
+ SglSelect,
18
+ SglSeparateReasoning,
19
+ SglVideo,
20
+ )
21
+
22
+
23
def function(
    func: Optional[Callable] = None, num_api_spec_tokens: Optional[int] = None
):
    """Decorator turning a plain callable into an SglFunction.

    Usable both bare (``@function``) and parameterized
    (``@function(num_api_spec_tokens=...)``).
    """
    if func:
        # Bare usage: wrap immediately.
        return SglFunction(func, num_api_spec_tokens=num_api_spec_tokens)

    # Parameterized usage: return the actual decorator.
    def decorator(inner_func):
        return SglFunction(inner_func, num_api_spec_tokens=num_api_spec_tokens)

    return decorator
33
+
34
+
35
def Runtime(*args, **kwargs):
    """Construct a lang Runtime, deferring the heavy import until first use."""
    # Avoid importing unnecessary dependency
    from sglang.lang.backend.runtime_endpoint import Runtime as _Runtime

    return _Runtime(*args, **kwargs)
40
+
41
+
42
def Engine(*args, **kwargs):
    """Construct an srt Engine, deferring the heavy import until first use."""
    # Avoid importing unnecessary dependency
    from sglang.srt.entrypoints.engine import Engine as _Engine

    return _Engine(*args, **kwargs)
47
+
48
+
49
def set_default_backend(backend: BaseBackend):
    """Register *backend* as the process-wide default for lang calls."""
    global_config.default_backend = backend
51
+
52
+
53
def flush_cache(backend: Optional[BaseBackend] = None):
    """Flush the cache of *backend* (or the default backend).

    Returns False when no backend is configured, otherwise whatever the
    backend's flush_cache() reports.
    """
    backend = backend or global_config.default_backend
    if backend is None:
        return False

    # A Runtime wrapper exposes the real backend via `.endpoint`.
    target = getattr(backend, "endpoint", backend)
    return target.flush_cache()
62
+
63
+
64
def get_server_info(backend: Optional[BaseBackend] = None):
    """Return server info from *backend* (or the default backend).

    Returns None when no backend is configured.
    """
    backend = backend or global_config.default_backend
    if backend is None:
        return None

    # A Runtime wrapper exposes the real backend via `.endpoint`.
    target = getattr(backend, "endpoint", backend)
    return target.get_server_info()
73
+
74
+
75
def gen(
    name: Optional[str] = None,
    max_tokens: Optional[int] = None,
    min_tokens: Optional[int] = None,
    n: Optional[int] = None,
    stop: Optional[Union[str, List[str]]] = None,
    stop_token_ids: Optional[List[int]] = None,
    stop_regex: Optional[Union[str, List[str]]] = None,
    temperature: Optional[float] = None,
    top_p: Optional[float] = None,
    top_k: Optional[int] = None,
    min_p: Optional[float] = None,
    frequency_penalty: Optional[float] = None,
    presence_penalty: Optional[float] = None,
    ignore_eos: Optional[bool] = None,
    return_logprob: Optional[bool] = None,
    logprob_start_len: Optional[int] = None,
    top_logprobs_num: Optional[int] = None,
    return_text_in_logprobs: Optional[bool] = None,
    dtype: Optional[Union[type, str]] = None,
    choices: Optional[List[str]] = None,
    choices_method: Optional[ChoicesSamplingMethod] = None,
    regex: Optional[str] = None,
    json_schema: Optional[str] = None,
):
    """Call the model to generate. See the meaning of the arguments in docs/backend/sampling_params.md"""

    # `choices` turns the call into a constrained selection; the free-form
    # sampling arguments are not forwarded in that case.
    if choices:
        return SglSelect(
            name,
            choices,
            0.0 if temperature is None else temperature,
            token_length_normalized if choices_method is None else choices_method,
        )

    # Validate the regex eagerly so an invalid pattern raises re.error here, at
    # construction time, rather than deep inside the backend. (The previous
    # `except re.error as e: raise e` wrapper was a no-op and has been removed.)
    if regex is not None:
        re.compile(regex)

    return SglGen(
        name,
        max_tokens,
        min_tokens,
        n,
        stop,
        stop_token_ids,
        stop_regex,
        temperature,
        top_p,
        top_k,
        min_p,
        frequency_penalty,
        presence_penalty,
        ignore_eos,
        return_logprob,
        logprob_start_len,
        top_logprobs_num,
        return_text_in_logprobs,
        dtype,
        regex,
        json_schema,
    )
140
+
141
+
142
def gen_int(
    name: Optional[str] = None,
    max_tokens: Optional[int] = None,
    n: Optional[int] = None,
    stop: Optional[Union[str, List[str]]] = None,
    stop_token_ids: Optional[List[int]] = None,
    stop_regex: Optional[Union[str, List[str]]] = None,
    temperature: Optional[float] = None,
    top_p: Optional[float] = None,
    top_k: Optional[int] = None,
    min_p: Optional[float] = None,
    frequency_penalty: Optional[float] = None,
    presence_penalty: Optional[float] = None,
    ignore_eos: Optional[bool] = None,
    return_logprob: Optional[bool] = None,
    logprob_start_len: Optional[int] = None,
    top_logprobs_num: Optional[int] = None,
    return_text_in_logprobs: Optional[bool] = None,
):
    """Generate an integer: gen() specialized with dtype=int and no min_tokens/regex.

    NOTE(review): SglGen is called positionally and, unlike gen(), omits the
    trailing json_schema argument — presumably SglGen defaults it; verify
    against SglGen's signature.
    """
    return SglGen(
        name,
        max_tokens,
        None,  # min_tokens
        n,
        stop,
        stop_token_ids,
        stop_regex,
        temperature,
        top_p,
        top_k,
        min_p,
        frequency_penalty,
        presence_penalty,
        ignore_eos,
        return_logprob,
        logprob_start_len,
        top_logprobs_num,
        return_text_in_logprobs,
        int,  # dtype
        None,  # regex
    )
183
+
184
+
185
def gen_string(
    name: Optional[str] = None,
    max_tokens: Optional[int] = None,
    n: Optional[int] = None,
    stop: Optional[Union[str, List[str]]] = None,
    stop_token_ids: Optional[List[int]] = None,
    stop_regex: Optional[Union[str, List[str]]] = None,
    temperature: Optional[float] = None,
    top_p: Optional[float] = None,
    top_k: Optional[int] = None,
    min_p: Optional[float] = None,
    frequency_penalty: Optional[float] = None,
    presence_penalty: Optional[float] = None,
    ignore_eos: Optional[bool] = None,
    return_logprob: Optional[bool] = None,
    logprob_start_len: Optional[int] = None,
    top_logprobs_num: Optional[int] = None,
    return_text_in_logprobs: Optional[bool] = None,
):
    """Generate a string literal: gen() specialized with dtype=str and no min_tokens/regex.

    NOTE(review): mirrors gen_int — SglGen is called positionally and omits the
    trailing json_schema argument that gen() passes; verify against SglGen.
    """
    return SglGen(
        name,
        max_tokens,
        None,  # min_tokens
        n,
        stop,
        stop_token_ids,
        stop_regex,
        temperature,
        top_p,
        top_k,
        min_p,
        frequency_penalty,
        presence_penalty,
        ignore_eos,
        return_logprob,
        logprob_start_len,
        top_logprobs_num,
        return_text_in_logprobs,
        str,  # dtype
        None,  # regex
    )
226
+
227
+
228
def image(expr: SglExpr):
    """Wrap an expression as an image input."""
    return SglImage(expr)


def video(path: str, num_frames: int):
    """Reference a video file by path, sampled to *num_frames* frames."""
    return SglVideo(path, num_frames)


def select(
    name: Optional[str] = None,
    choices: Optional[List[str]] = None,
    temperature: float = 0.0,
    choices_method: ChoicesSamplingMethod = token_length_normalized,
):
    """Ask the model to pick one of *choices* (see SglSelect).

    Raises:
        ValueError: if *choices* is not provided.
    """
    # Raise instead of `assert`: asserts are stripped under `python -O`.
    if choices is None:
        raise ValueError("`choices` must be provided for select()")
    return SglSelect(name, choices, temperature, choices_method)
244
+
245
+
246
def _role_common(name: str, expr: Optional[SglExpr] = None):
    """Wrap *expr* (if any) between begin/end markers for the given role."""
    parts = [SglRoleBegin(name)]
    if expr is not None:
        parts.append(expr)
    parts.append(SglRoleEnd(name))
    return SglExprList(parts)


def system(expr: Optional[SglExpr] = None):
    """A system-role segment."""
    return _role_common("system", expr)


def user(expr: Optional[SglExpr] = None):
    """A user-role segment."""
    return _role_common("user", expr)


def assistant(expr: Optional[SglExpr] = None):
    """An assistant-role segment."""
    return _role_common("assistant", expr)


def system_begin():
    """Open a system-role segment."""
    return SglRoleBegin("system")


def system_end():
    """Close a system-role segment."""
    return SglRoleEnd("system")


def user_begin():
    """Open a user-role segment."""
    return SglRoleBegin("user")


def user_end():
    """Close a user-role segment."""
    return SglRoleEnd("user")


def assistant_begin():
    """Open an assistant-role segment."""
    return SglRoleBegin("assistant")


def assistant_end():
    """Close an assistant-role segment."""
    return SglRoleEnd("assistant")
287
+
288
+
289
def separate_reasoning(
    expr: Optional[SglExpr] = None, model_type: Optional[str] = None
):
    """Append a reasoning-separation marker after *expr*.

    NOTE(review): when expr is None the returned list still contains the None
    placeholder — confirm SglExprList tolerates None entries, or guard like
    _role_common does.
    """
    return SglExprList([expr, SglSeparateReasoning(model_type, expr=expr)])
sglang/lang/backend/__pycache__/base_backend.cpython-311.pyc ADDED
Binary file (4.6 kB). View file
 
sglang/lang/backend/__pycache__/runtime_endpoint.cpython-311.pyc ADDED
Binary file (25.5 kB). View file
 
sglang/lang/backend/anthropic.py ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from sglang.lang.backend.base_backend import BaseBackend
2
+ from sglang.lang.chat_template import get_chat_template
3
+ from sglang.lang.interpreter import StreamExecutor
4
+ from sglang.lang.ir import SglSamplingParams
5
+
6
+ try:
7
+ import anthropic
8
+ except ImportError as e:
9
+ anthropic = e
10
+
11
+
12
class Anthropic(BaseBackend):
    """Frontend-language backend backed by the Anthropic Messages API."""

    def __init__(self, model_name, *args, **kwargs):
        super().__init__()

        # The module-level import stored the ImportError when the anthropic
        # package is missing; surface it only when the backend is used.
        if isinstance(anthropic, Exception):
            raise anthropic

        self.model_name = model_name
        self.chat_template = get_chat_template("claude")
        self.client = anthropic.Anthropic(*args, **kwargs)

    def get_chat_template(self):
        return self.chat_template

    def _build_request_messages(self, s: StreamExecutor):
        """Return ``(system, messages)`` for an API call.

        Operates on a copy of ``s.messages_`` so that popping a leading
        system turn does not mutate the executor's conversation state
        (the previous implementation popped from the shared list in place,
        permanently dropping the system message from ``s.messages_``).
        """
        if s.messages_:
            messages = list(s.messages_)
        else:
            messages = [{"role": "user", "content": s.text_}]

        # Anthropic takes the system prompt as a separate argument, not as
        # a message entry.
        if messages and messages[0]["role"] == "system":
            system = messages.pop(0)["content"]
        else:
            system = ""
        return system, messages

    def generate(
        self,
        s: StreamExecutor,
        sampling_params: SglSamplingParams,
    ):
        """Run one non-streaming completion; returns (text, meta_info)."""
        system, messages = self._build_request_messages(s)

        ret = self.client.messages.create(
            model=self.model_name,
            system=system,
            messages=messages,
            **sampling_params.to_anthropic_kwargs(),
        )
        comp = ret.content[0].text

        return comp, {}

    def generate_stream(
        self,
        s: StreamExecutor,
        sampling_params: SglSamplingParams,
    ):
        """Stream a completion, yielding (text_chunk, meta_info) pairs."""
        system, messages = self._build_request_messages(s)

        with self.client.messages.stream(
            model=self.model_name,
            system=system,
            messages=messages,
            **sampling_params.to_anthropic_kwargs(),
        ) as stream:
            for text in stream.text_stream:
                yield text, {}
sglang/lang/backend/base_backend.py ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Optional, Union
2
+
3
+ from sglang.lang.chat_template import get_chat_template
4
+ from sglang.lang.choices import ChoicesDecision, ChoicesSamplingMethod
5
+ from sglang.lang.interpreter import StreamExecutor
6
+ from sglang.lang.ir import SglSamplingParams
7
+
8
+
9
class BaseBackend:
    """Interface that every frontend-language backend implements.

    Most hooks are no-ops here; concrete backends override the subset they
    support. ``generate``/``generate_stream``/``select`` are the required
    core operations and raise ``NotImplementedError`` by default.
    """

    def __init__(self) -> None:
        # Whether the backend supports concatenate_and_append (only the
        # RuntimeEndpoint backend sets this to True).
        self.support_concate_and_append = False
        self.chat_template = get_chat_template("default")

    def get_model_name(self):
        """Return the served model's name/path. Must be overridden."""
        raise NotImplementedError()

    def get_chat_template(self):
        """Return the chat template used to format role sections."""
        return self.chat_template

    def cache_prefix(self, prefix_str: str):
        """Optionally warm the backend's prefix cache. No-op by default."""
        pass

    def uncache_prefix(self, rid: str):
        """Optionally release a cached prefix. No-op by default."""
        pass

    def end_request(self, rid: Union[str, List[str]]):
        """Optionally finalize one or more requests. No-op by default."""
        pass

    def begin_program(self, s: StreamExecutor):
        """Hook called when a program starts executing. No-op by default."""
        pass

    def end_program(self, s: Union[StreamExecutor, List[StreamExecutor]]):
        """Hook called when a program finishes. No-op by default."""
        pass

    def commit_lazy_operations(self, s: StreamExecutor):
        """Flush any buffered/lazy operations for *s*. No-op by default."""
        pass

    def fork_program(
        self,
        src: StreamExecutor,
        dst: List[StreamExecutor],
        position_ids_offset: Optional[List[int]] = None,
    ):
        """Hook called when *src* forks into the *dst* executors. No-op."""
        pass

    def fill_image(self, s: StreamExecutor):
        """Upload/attach pending image data for *s*. No-op by default."""
        pass

    def generate(
        self,
        s: StreamExecutor,
        sampling_params: SglSamplingParams,
    ):
        """Produce a completion; must return (text, meta_info)."""
        raise NotImplementedError()

    def generate_stream(
        self,
        s: StreamExecutor,
        sampling_params: SglSamplingParams,
    ):
        """Yield (text_chunk, meta_info) pairs for a streamed completion."""
        raise NotImplementedError()

    def select(
        self,
        s: StreamExecutor,
        choices: List[str],
        temperature: float,
        choices_method: Optional[ChoicesSamplingMethod] = None,
    ) -> ChoicesDecision:
        """Score *choices* and return the backend's decision."""
        raise NotImplementedError()

    def concatenate_and_append(self, src_rids: List[str], dst_rid: str):
        """Merge source requests into *dst_rid* (RuntimeEndpoint only)."""
        raise NotImplementedError()

    def shutdown(self):
        """Release backend resources. No-op by default."""
        pass

    def flush_cache(self):
        """Clear any server-side cache. No-op by default."""
        pass

    def get_server_info(self):
        """Return backend/server metadata, if available. No-op by default."""
        pass
sglang/lang/backend/litellm.py ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Mapping, Optional
2
+
3
+ from sglang.lang.backend.base_backend import BaseBackend
4
+ from sglang.lang.chat_template import get_chat_template_by_model_path
5
+ from sglang.lang.interpreter import StreamExecutor
6
+ from sglang.lang.ir import SglSamplingParams
7
+
8
+ try:
9
+ import litellm
10
+ except ImportError as e:
11
+ litellm = e
12
+ litellm.num_retries = 1
13
+
14
+
15
class LiteLLM(BaseBackend):
    """Frontend-language backend that routes requests through LiteLLM."""

    def __init__(
        self,
        model_name,
        chat_template=None,
        api_key=None,
        organization: Optional[str] = None,
        base_url: Optional[str] = None,
        timeout: Optional[float] = 600,
        max_retries: Optional[int] = litellm.num_retries,
        default_headers: Optional[Mapping[str, str]] = None,
    ):
        super().__init__()

        # The module-level import stashed the ImportError if litellm is not
        # installed; raise it lazily, on first use.
        if isinstance(litellm, Exception):
            raise litellm

        self.model_name = model_name
        self.chat_template = chat_template or get_chat_template_by_model_path(
            model_name
        )

        # Connection parameters forwarded verbatim to every completion call.
        self.client_params = {
            "api_key": api_key,
            "organization": organization,
            "base_url": base_url,
            "timeout": timeout,
            "max_retries": max_retries,
            "default_headers": default_headers,
        }

    def get_chat_template(self):
        return self.chat_template

    def _conversation(self, s: StreamExecutor):
        """Return the chat history, or the raw text as a single user turn."""
        if s.messages_:
            return s.messages_
        return [{"role": "user", "content": s.text_}]

    def generate(
        self,
        s: StreamExecutor,
        sampling_params: SglSamplingParams,
    ):
        """Run one non-streaming completion; returns (text, meta_info)."""
        response = litellm.completion(
            model=self.model_name,
            messages=self._conversation(s),
            **self.client_params,
            **sampling_params.to_litellm_kwargs(),
        )
        return response.choices[0].message.content, {}

    def generate_stream(
        self,
        s: StreamExecutor,
        sampling_params: SglSamplingParams,
    ):
        """Stream a completion, yielding (text_chunk, meta_info) pairs."""
        response = litellm.completion(
            model=self.model_name,
            messages=self._conversation(s),
            stream=True,
            **self.client_params,
            **sampling_params.to_litellm_kwargs(),
        )
        for chunk in response:
            piece = chunk.choices[0].delta.content
            if piece is not None:
                yield piece, {}
sglang/lang/backend/openai.py ADDED
@@ -0,0 +1,475 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import dataclasses
2
+ import logging
3
+ import time
4
+ import warnings
5
+ from typing import List, Optional, Union
6
+
7
+ import numpy as np
8
+
9
+ from sglang.lang.backend.base_backend import BaseBackend
10
+ from sglang.lang.chat_template import ChatTemplate, get_chat_template_by_model_path
11
+ from sglang.lang.choices import ChoicesDecision, ChoicesSamplingMethod
12
+ from sglang.lang.interpreter import StreamExecutor
13
+ from sglang.lang.ir import SglSamplingParams
14
+
15
+ try:
16
+ import openai
17
+ import tiktoken
18
+ except ImportError as e:
19
+ openai = tiktoken = e
20
+
21
+
22
+ logger = logging.getLogger(__name__)
23
+
24
+
25
def create_logit_bias_int(tokenizer):
    """Build a logit-bias mask that strongly favors integer-like tokens.

    Scans the tokenizer's mergeable ranks for tokens whose decoded text is
    all digits (or a single space), stopping after 300 candidates
    (the OpenAI API limit on logit_bias entries). The first 299 of them,
    plus the <|endoftext|> special token, receive bias weight 100.
    """
    digit_ids = []
    for _, token_id in tokenizer._mergeable_ranks.items():
        decoded = tokenizer.decode([token_id])
        # all() is True for the empty string as well, matching the
        # original behavior.
        if all(ch.isdigit() for ch in decoded) or decoded == " ":
            digit_ids.append(token_id)
            if len(digit_ids) >= 300:  # OpenAI API limit
                break

    mask = dict.fromkeys(digit_ids[:299], 100)
    mask[tokenizer._special_tokens["<|endoftext|>"]] = 100
    return mask
40
+
41
+
42
# Model names served through the legacy /completions endpoint rather than
# the chat endpoint; used to auto-detect is_chat_model in OpenAI.__init__.
INSTRUCT_MODEL_NAMES = [
    "gpt-3.5-turbo-instruct",
]
45
+
46
+
47
@dataclasses.dataclass
class TokenUsage:
    """Running totals of tokens consumed by a backend."""

    prompt_tokens: int
    completion_tokens: int

    def reset(self):
        """Zero both counters."""
        self.prompt_tokens = 0
        self.completion_tokens = 0
55
+
56
class OpenAI(BaseBackend):
    """Frontend-language backend for OpenAI (and Azure OpenAI) endpoints.

    Supports chat and legacy completion models, integer-constrained
    decoding via logit bias, greedy ``select`` over choices, and API-level
    speculative execution for chat models.
    """

    def __init__(
        self,
        model_name: str,
        is_chat_model: Optional[bool] = None,
        chat_template: Optional[ChatTemplate] = None,
        is_azure: bool = False,
        *args,
        **kwargs,
    ):
        super().__init__()

        # The module-level import stored the ImportError when openai/tiktoken
        # are missing; surface it only when the backend is actually used.
        if isinstance(openai, Exception):
            raise openai

        if is_azure:
            self.client = openai.AzureOpenAI(*args, **kwargs)
        else:
            self.client = openai.OpenAI(*args, **kwargs)

        self.model_name = model_name
        try:
            self.tokenizer = tiktoken.encoding_for_model(model_name)
        except KeyError:
            # Unknown model name: fall back to the cl100k_base encoding.
            self.tokenizer = tiktoken.get_encoding("cl100k_base")
        self.logit_bias_int = create_logit_bias_int(self.tokenizer)

        self.chat_template = chat_template or get_chat_template_by_model_path(
            model_name
        )

        if is_chat_model is not None:
            self.is_chat_model = is_chat_model
        else:
            # Default to chat unless the model is a known instruct model.
            if model_name in INSTRUCT_MODEL_NAMES:
                self.is_chat_model = False
            else:
                self.is_chat_model = True

        self.chat_prefix = self.chat_template.role_prefix_and_suffix["assistant"][0]

        # Cumulative token usage across all calls made by this backend.
        self.token_usage = TokenUsage(0, 0)

        # API speculative execution state.
        # TODO(ying): This does not support multi-threading (run_batch)
        self.spec_kwargs = {}
        self.spec_format = []
        self.spec_max_num_tries = 3

    def get_chat_template(self):
        return self.chat_template

    def _prepare_spec_execution(
        self,
        sampling_params: SglSamplingParams,
        num_api_spec_tokens: int,
        spec_var_name: Optional[str] = None,
    ):
        """Record a pending gen() for speculative execution.

        Instead of calling the API now, buffer the sampling parameters and
        the variable name; the real call happens in role_end_generate.
        Returns an empty ("", {}) placeholder.
        """
        if "max_tokens" not in self.spec_kwargs:
            self.spec_kwargs["max_tokens"] = num_api_spec_tokens
        else:
            assert self.spec_kwargs["max_tokens"] == num_api_spec_tokens

        params = sampling_params.to_openai_kwargs()
        for key, value in params.items():
            if key in ["stop"]:
                continue
            if key in ["max_tokens"]:
                warnings.warn(
                    "The parameter max_tokens will be overwritten by speculated number of tokens."
                )
                continue
            if key not in self.spec_kwargs:
                self.spec_kwargs[key] = value
            else:
                # All buffered gen() calls must share one parameter set.
                assert (
                    value == self.spec_kwargs[key]
                ), "sampling parameters should be consistent if turn on api speculative execution."
        self.spec_format.append(
            {"text": "", "stop": params["stop"], "name": spec_var_name}
        )
        return "", {}

    def generate(
        self,
        s: StreamExecutor,
        sampling_params: SglSamplingParams,
        spec_var_name: Optional[str] = None,
    ):
        """Run one non-streaming completion; returns (text, meta_info).

        dtype=None uses the plain chat/completion path; dtype=str/int use
        constrained decoding tricks that only work on non-chat models.
        """
        if sampling_params.dtype is None:
            if self.is_chat_model:
                if s.num_api_spec_tokens is None:
                    if not s.text_.endswith(self.chat_prefix):
                        raise RuntimeError(
                            "This use case is not supported if api speculative execution is off. "
                            "For OpenAI chat models, sgl.gen must be right after sgl.assistant. "
                            "Example of adding api speculative execution: @function(num_api_spec_tokens=128)."
                        )
                    prompt = s.messages_
                else:
                    # Defer the API call; see _prepare_spec_execution.
                    return self._prepare_spec_execution(
                        sampling_params, s.num_api_spec_tokens, spec_var_name
                    )
            else:
                prompt = s.text_

            kwargs = sampling_params.to_openai_kwargs()
            # Reasoning models (o1/o3 family) reject max_tokens and require
            # max_completion_tokens; other models are the opposite.
            if (
                self.model_name.startswith("o1")
                or self.model_name.startswith("o3")
                or "o1" in self.model_name
            ):
                kwargs.pop("max_tokens", None)
            else:
                kwargs.pop("max_completion_tokens", None)

            comp = openai_completion(
                client=self.client,
                token_usage=self.token_usage,
                is_chat=self.is_chat_model,
                model=self.model_name,
                prompt=prompt,
                **kwargs,
            )
            # Keep the returned list (or string) as is.
        elif sampling_params.dtype in [str, "str", "string"]:
            assert (
                not self.is_chat_model
            ), "constrained type not supported on chat model"
            kwargs = sampling_params.to_openai_kwargs()
            kwargs.pop("stop")
            # Force a quoted string: open the quote ourselves and stop at
            # the closing quote.
            comp = openai_completion(
                client=self.client,
                token_usage=self.token_usage,
                is_chat=self.is_chat_model,
                model=self.model_name,
                prompt=s.text_ + '"',
                stop='"',
                **kwargs,
            )
            # Wrap each element in quotes if we have a list.
            if isinstance(comp, list):
                comp = ['"' + x + '"' for x in comp]
            else:
                comp = '"' + comp + '"'
        elif sampling_params.dtype in [int, "int"]:
            assert (
                not self.is_chat_model
            ), "constrained type not supported on chat model"
            kwargs = sampling_params.to_openai_kwargs()
            kwargs.pop("stop")
            # Bias decoding toward digit tokens and stop at whitespace.
            comp = openai_completion(
                client=self.client,
                token_usage=self.token_usage,
                is_chat=self.is_chat_model,
                model=self.model_name,
                prompt=s.text_,
                logit_bias=self.logit_bias_int,
                stop=[" "],
                **kwargs,
            )
            # Leave as a list if that's what is returned.
        else:
            raise ValueError(f"Unknown dtype: {sampling_params.dtype}")

        return comp, {}

    def spec_fill(self, value: str):
        """Buffer a literal fill between speculative gen() calls."""
        assert self.is_chat_model
        self.spec_format.append({"text": value, "stop": None, "name": None})

    def spec_pattern_match(self, comp):
        """Try to align *comp* with the buffered spec template.

        Fills in the "text" of each gen() slot from the completion; returns
        False when the completion does not match the template.
        """
        for i, term in enumerate(self.spec_format):
            text = term["text"]
            if text != "":
                # Literal fill: the completion must start with it.
                if comp.startswith(text):
                    comp = comp[len(text) :]
                else:
                    return False
            else:
                # gen() slot: consume up to the slot's stop string.
                pos = comp.find(term["stop"])
                if pos != -1:
                    term["text"] = comp[:pos]
                    comp = comp[pos:]
                else:
                    # Only the last slot may run to the end of the text.
                    if i == len(self.spec_format) - 1:
                        term["text"] = comp
                    else:
                        return False
        return True

    def role_end_generate(
        self,
        s: StreamExecutor,
    ):
        """Execute buffered speculative gen() calls at the end of a role.

        Makes up to spec_max_num_tries API calls until one completion
        matches the buffered template, then commits the slot texts into the
        executor's variables.
        """
        if s.num_api_spec_tokens is None or not s.text_.endswith(self.chat_prefix):
            return

        comp = ""
        if not all(x["name"] is None for x in self.spec_format):
            # TODO(ying): throw errors or warnings
            for i in range(self.spec_max_num_tries):
                comp = openai_completion(
                    client=self.client,
                    token_usage=self.token_usage,
                    is_chat=self.is_chat_model,
                    model=self.model_name,
                    prompt=s.messages_,
                    **self.spec_kwargs,
                )
                # Use a string for pattern matching.
                comp_for_match = comp[0] if isinstance(comp, list) else comp
                if self.spec_pattern_match(comp_for_match):
                    break

        for term in self.spec_format:
            s.text_ += term["text"]
            name = term["name"]
            if name is not None:
                s.variables[name] = term["text"]
                s.meta_info[name] = {}
                s.variable_event[name].set()

        self.spec_kwargs = {}
        self.spec_format = []

    def generate_stream(
        self,
        s: StreamExecutor,
        sampling_params: SglSamplingParams,
    ):
        """Stream a completion, yielding (text_chunk, meta_info) pairs."""
        if sampling_params.dtype is None:
            if self.is_chat_model:
                if not s.text_.endswith(self.chat_prefix):
                    raise RuntimeError(
                        "This use case is not supported. "
                        "For OpenAI chat models, sgl.gen must be right after sgl.assistant"
                    )
                prompt = s.messages_
            else:
                prompt = s.text_

            kwargs = sampling_params.to_openai_kwargs()
            generator = openai_completion_stream(
                client=self.client,
                token_usage=self.token_usage,
                is_chat=self.is_chat_model,
                model=self.model_name,
                prompt=prompt,
                **kwargs,
            )
            return generator
        else:
            raise ValueError(f"Unknown dtype: {sampling_params.dtype}")

    def select(
        self,
        s: StreamExecutor,
        choices: List[str],
        temperature: float,
        choices_method: ChoicesSamplingMethod,
    ) -> ChoicesDecision:
        """Note: `choices_method` is not used by the OpenAI backend."""
        if self.is_chat_model:
            raise NotImplementedError(
                "select/choices is not supported for chat models. "
                "Please try to use a non-chat model such as gpt-3.5-turbo-instruct"
            )

        n_choices = len(choices)
        token_ids = [self.tokenizer.encode(x) for x in choices]
        scores = [0] * n_choices
        valid = [len(x) > 0 for x in token_ids]
        prompt_tokens = self.tokenizer.encode(s.text_)

        max_len = max([len(x) for x in token_ids])
        for step in range(max_len):
            # Build logit bias restricting the model to the still-valid
            # choices' next tokens.
            logit_bias = {}
            for i in range(n_choices):
                if valid[i]:
                    logit_bias[token_ids[i][step]] = 100

            # Call API
            ret = self.client.completions.create(
                model=self.model_name,
                prompt=prompt_tokens,
                logit_bias=logit_bias,
                max_tokens=1,
                temperature=temperature,
            )
            ret_str = ret.choices[0].text
            ret_token = self.tokenizer.encode(ret_str)[0]
            self.token_usage.prompt_tokens += ret.usage.prompt_tokens
            # Fix: accumulate with += (was `=`, which overwrote the running
            # total, inconsistent with prompt_tokens just above and with
            # every other usage update in this module).
            self.token_usage.completion_tokens += ret.usage.completion_tokens

            # TODO:
            # 1. return logits as the scores
            # 2. compute logits of the full choice
            # 3. consider chunk-based decoding

            # Update valid
            hit = False
            for i in range(n_choices):
                if valid[i]:
                    if step == len(token_ids[i]) - 1:
                        valid[i] = False

                    if ret_token == token_ids[i][step]:
                        scores[i] += 1
                        hit = True
                    else:
                        valid[i] = False
            assert hit

            if np.sum(valid) <= 1:
                break

            prompt_tokens.append(ret_token)

        return ChoicesDecision(
            decision=choices[np.argmax(scores)],
            meta_info={"scores": scores},
        )
381
+
382
+
383
def openai_completion(
    client, token_usage, is_chat=None, retries=3, prompt=None, **kwargs
) -> Union[str, List[str]]:
    """Call the OpenAI completion API with simple retry-on-error handling.

    Returns a single string for a single choice, or a list of strings when
    multiple choices (or a batched prompt) are returned. ``token_usage`` is
    updated in place with the usage reported by the API.
    """
    if "ebnf" in kwargs:
        warnings.warn("EBNF is not officially supported by OpenAI endpoints. Ignoring.")
        del kwargs["ebnf"]

    for attempt in range(retries):
        try:
            if is_chat:
                # The chat endpoint rejects an explicit None stop.
                if kwargs.get("stop", "") is None:
                    kwargs.pop("stop")
                ret = client.chat.completions.create(messages=prompt, **kwargs)
                contents = [c.message.content for c in ret.choices]
                comp = contents[0] if len(contents) == 1 else contents
            else:
                ret = client.completions.create(prompt=prompt, **kwargs)
                texts = [c.text for c in ret.choices]
                if isinstance(prompt, (list, tuple)) or len(texts) > 1:
                    comp = texts
                else:
                    comp = texts[0]

            token_usage.prompt_tokens += ret.usage.prompt_tokens
            token_usage.completion_tokens += ret.usage.completion_tokens
            break
        except (openai.APIError, openai.APIConnectionError, openai.RateLimitError) as e:
            logger.error(f"OpenAI Error: {e}. Waiting 5 seconds...")
            time.sleep(5)
            if attempt == retries - 1:
                raise e
        except Exception as e:
            logger.error(f"RuntimeError {e}.")
            raise e

    return comp
423
+
424
+
425
def openai_completion_stream(
    client, token_usage, is_chat=None, retries=3, prompt=None, **kwargs
):
    """Stream an OpenAI completion, yielding (text_chunk, {}) pairs.

    Uses stream_options={"include_usage": True} so the final chunk carries
    usage data, which is accumulated into ``token_usage`` after the stream
    ends.

    NOTE(review): a retry after an exception mid-stream would re-yield text
    already delivered to the caller; and if the stream yields no chunks at
    all, ``ret`` is unbound when usage is read — confirm whether these
    cases can occur in practice.
    """
    # if "ebnf" is in kwargs, warn and remove
    if "ebnf" in kwargs:
        warnings.warn("EBNF is not officially supported by OpenAI endpoints. Ignoring.")
        del kwargs["ebnf"]

    for attempt in range(retries):
        try:
            if is_chat:
                # The chat endpoint rejects an explicit None stop.
                if "stop" in kwargs and kwargs["stop"] is None:
                    kwargs.pop("stop")
                generator = client.chat.completions.create(
                    messages=prompt,
                    stream=True,
                    stream_options={"include_usage": True},
                    **kwargs,
                )
                for ret in generator:
                    # The usage-only final chunk has no choices.
                    if len(ret.choices) == 0:
                        continue
                    try:
                        content = ret.choices[0].delta.content
                    except IndexError:
                        content = None
                    yield content or "", {}
            else:
                generator = client.completions.create(
                    prompt=prompt,
                    stream=True,
                    stream_options={"include_usage": True},
                    **kwargs,
                )
                for ret in generator:
                    if len(ret.choices) == 0:
                        continue
                    content = ret.choices[0].text
                    yield content or "", {}

            # `ret` is the last chunk seen; with include_usage it carries
            # the final usage totals.
            token_usage.prompt_tokens += ret.usage.prompt_tokens
            token_usage.completion_tokens += ret.usage.completion_tokens
            break
        except (openai.APIError, openai.APIConnectionError, openai.RateLimitError) as e:
            logger.error(f"OpenAI Error: {e}. Waiting 5 seconds...")
            time.sleep(5)
            if attempt == retries - 1:
                raise e
        except Exception as e:
            logger.error(f"RuntimeError {e}.")
            raise e
sglang/lang/backend/runtime_endpoint.py ADDED
@@ -0,0 +1,527 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import atexit
2
+ import json
3
+ import multiprocessing
4
+ import warnings
5
+ from typing import Dict, List, Optional, Union
6
+
7
+ import aiohttp
8
+ import requests
9
+
10
+ from sglang.global_config import global_config
11
+ from sglang.lang.backend.base_backend import BaseBackend
12
+ from sglang.lang.chat_template import get_chat_template, get_chat_template_by_model_path
13
+ from sglang.lang.choices import ChoicesDecision, ChoicesSamplingMethod
14
+ from sglang.lang.interpreter import StreamExecutor
15
+ from sglang.lang.ir import (
16
+ REGEX_BOOL,
17
+ REGEX_FLOAT,
18
+ REGEX_INT,
19
+ REGEX_STR,
20
+ SglSamplingParams,
21
+ )
22
+ from sglang.utils import http_request
23
+
24
+
25
class RuntimeEndpoint(BaseBackend):
    """Frontend-language backend that talks to a running SGLang HTTP server.

    All operations are plain HTTP requests against ``base_url``; payload
    shapes follow the server's /generate protocol.
    """

    def __init__(
        self,
        base_url: str,
        api_key: Optional[str] = None,
        verify: Optional[str] = None,
        chat_template_name: Optional[str] = None,
    ):
        super().__init__()
        # This backend supports server-side request concatenation.
        self.support_concate_and_append = True

        self.base_url = base_url
        self.api_key = api_key
        self.verify = verify

        # Probe the server for model metadata; fails fast if unreachable.
        res = http_request(
            self.base_url + "/get_model_info",
            api_key=self.api_key,
            verify=self.verify,
        )
        self._assert_success(res)
        self.model_info = res.json()

        if chat_template_name:
            self.chat_template = get_chat_template(chat_template_name)
        else:
            self.chat_template = get_chat_template_by_model_path(
                self.model_info["model_path"]
            )

    def get_model_name(self):
        """Return the served model's path as reported by the server."""
        return self.model_info["model_path"]

    def flush_cache(self):
        """Ask the server to drop its radix/prefix cache."""
        res = http_request(
            self.base_url + "/flush_cache",
            api_key=self.api_key,
            verify=self.verify,
            method="POST",
        )
        self._assert_success(res)

    def get_server_info(self):
        """Return the server's info payload as a dict."""
        res = http_request(
            self.base_url + "/get_server_info",
            api_key=self.api_key,
            verify=self.verify,
        )
        self._assert_success(res)
        return res.json()

    def get_chat_template(self):
        return self.chat_template

    def cache_prefix(self, prefix_str: str):
        """Warm the server cache by generating 0 tokens for the prefix."""
        res = http_request(
            self.base_url + "/generate",
            json={"text": prefix_str, "sampling_params": {"max_new_tokens": 0}},
            api_key=self.api_key,
            verify=self.verify,
        )
        self._assert_success(res)

    def start_profile(self):
        """Start server-side profiling."""
        res = http_request(
            self.base_url + "/start_profile",
            api_key=self.api_key,
            verify=self.verify,
        )
        self._assert_success(res)

    def stop_profile(self):
        """Stop server-side profiling."""
        res = http_request(
            self.base_url + "/stop_profile",
            api_key=self.api_key,
            verify=self.verify,
        )
        self._assert_success(res)

    def commit_lazy_operations(self, s: StreamExecutor):
        """Flush buffered text to the server (0-token generate call)."""
        data = {"text": s.text_, "sampling_params": {"max_new_tokens": 0}}
        self._add_images(s, data)
        res = http_request(
            self.base_url + "/generate",
            json=data,
            api_key=self.api_key,
            verify=self.verify,
        )
        self._assert_success(res)

    def fill_image(self, s: StreamExecutor):
        """Upload the executor's pending image via a 0-token generate call."""
        data = {"text": s.text_, "sampling_params": {"max_new_tokens": 0}}
        self._add_images(s, data)
        res = http_request(
            self.base_url + "/generate",
            json=data,
            api_key=self.api_key,
            verify=self.verify,
        )
        self._assert_success(res)

    def _handle_dtype_to_regex(self, sampling_params: SglSamplingParams):
        """Translate a requested dtype into a decoding regex (in place).

        Also extends the stop list with whitespace for numeric dtypes so
        generation ends at the first separator.
        """
        if sampling_params.dtype is None:
            return

        if sampling_params.stop == ():
            sampling_params.stop = []

        dtype_regex = None
        if sampling_params.dtype in ["int", int]:

            dtype_regex = REGEX_INT
            sampling_params.stop.extend([" ", "\n"])
        elif sampling_params.dtype in ["float", float]:

            dtype_regex = REGEX_FLOAT
            sampling_params.stop.extend([" ", "\n"])
        elif sampling_params.dtype in ["str", str]:

            dtype_regex = REGEX_STR
        elif sampling_params.dtype in ["bool", bool]:

            dtype_regex = REGEX_BOOL
        else:
            raise RuntimeError(f"Invalid dtype: {sampling_params.dtype}")

        if dtype_regex is not None and sampling_params.regex is not None:
            # dtype takes precedence over a user-supplied regex.
            warnings.warn(
                f"Both dtype and regex are set. Only dtype will be used. dtype: {sampling_params.dtype}, regex: {sampling_params.regex}"
            )

        sampling_params.regex = dtype_regex

    def generate(
        self,
        s: StreamExecutor,
        sampling_params: SglSamplingParams,
    ):
        """Run one non-streaming generation; returns (text, meta_info)."""
        self._handle_dtype_to_regex(sampling_params)
        data = {
            "text": s.text_,
            "sampling_params": {
                "skip_special_tokens": global_config.skip_special_tokens_in_output,
                "spaces_between_special_tokens": global_config.spaces_between_special_tokens_in_out,
                **sampling_params.to_srt_kwargs(),
            },
        }

        # Logprob-related options live at the top level of the request,
        # not inside sampling_params.
        for item in [
            "return_logprob",
            "logprob_start_len",
            "top_logprobs_num",
            "return_text_in_logprobs",
        ]:
            value = getattr(sampling_params, item, None)
            if value is not None:
                data[item] = value

        self._add_images(s, data)

        res = http_request(
            self.base_url + "/generate",
            json=data,
            api_key=self.api_key,
            verify=self.verify,
        )
        self._assert_success(res)

        obj = res.json()
        comp = obj["text"]
        return comp, obj["meta_info"]

    def generate_stream(
        self,
        s: StreamExecutor,
        sampling_params: SglSamplingParams,
    ):
        """Stream a generation, yielding (text_chunk, meta_info) pairs.

        Parses the server's SSE-style "data: ..." lines; the server sends
        cumulative text, so `pos` tracks what has already been yielded.
        """
        self._handle_dtype_to_regex(sampling_params)

        data = {
            "text": s.text_,
            "sampling_params": {
                "skip_special_tokens": global_config.skip_special_tokens_in_output,
                "spaces_between_special_tokens": global_config.spaces_between_special_tokens_in_out,
                **sampling_params.to_srt_kwargs(),
            },
        }

        for item in [
            "return_logprob",
            "logprob_start_len",
            "top_logprobs_num",
            "return_text_in_logprobs",
        ]:
            value = getattr(sampling_params, item, None)
            if value is not None:
                data[item] = value

        data["stream"] = True
        self._add_images(s, data)

        res = http_request(
            self.base_url + "/generate",
            json=data,
            stream=True,
            api_key=self.api_key,
            verify=self.verify,
        )
        self._assert_success(res)
        pos = 0

        for chunk in res.iter_lines(decode_unicode=False):
            chunk = chunk.decode("utf-8")
            if chunk and chunk.startswith("data:"):
                if chunk == "data: [DONE]":
                    break
                data = json.loads(chunk[5:].strip("\n"))
                chunk_text = data["text"][pos:]
                meta_info = data["meta_info"]
                pos += len(chunk_text)
                yield chunk_text, meta_info

    def select(
        self,
        s: StreamExecutor,
        choices: List[str],
        temperature: float,
        choices_method: ChoicesSamplingMethod,
    ) -> ChoicesDecision:
        """Pick among *choices* by scoring each continuation's logprobs.

        Only greedy selection is supported (temperature must be ~0).
        """
        assert temperature <= 1e-5

        # Cache common prefix
        data = {"text": s.text_, "sampling_params": {"max_new_tokens": 0}}
        obj = self._generate_http_request(s, data)
        prompt_len = obj["meta_info"]["prompt_tokens"]
        # Start logprobs slightly before the prompt end so a token that
        # spans the prompt/choice boundary (token healing) is covered.
        logprob_start_len = max(prompt_len - 2, 0)  # For token healing

        # Compute logprob
        data = {
            "text": [s.text_ + c for c in choices],
            "sampling_params": {
                "max_new_tokens": 0,
                "temperature": 0,
            },
            "return_logprob": True,
            "return_text_in_logprobs": True,
            "logprob_start_len": logprob_start_len,
        }
        obj = self._generate_http_request(s, data)

        input_token_logprobs = [r["meta_info"]["input_token_logprobs"] for r in obj]
        output_token_logprobs = [r["meta_info"]["output_token_logprobs"] for r in obj]
        normalized_prompt_logprobs = [
            compute_normalized_prompt_logprobs(r["meta_info"]["input_token_logprobs"])
            for r in obj
        ]

        # Remove extra token if no token healing occurred
        for i in range(len(input_token_logprobs)):
            healed_token_str = input_token_logprobs[i][0][-1]
            if s.text_.endswith(healed_token_str):
                # No healing happened: drop the boundary token's logprob
                # from the normalized average.
                healed_token_logprob = input_token_logprobs[i][0][0]
                normalized_prompt_logprobs[i] = (
                    normalized_prompt_logprobs[i] * len(input_token_logprobs[i])
                    - healed_token_logprob
                ) / (len(input_token_logprobs[i]) - 1)
                input_token_logprobs[i] = input_token_logprobs[i][1:]

        # Compute unconditional logprobs if required
        if choices_method.requires_unconditional_logprobs:
            input_ids = [[el[1] for el in subl] for subl in input_token_logprobs]
            data = {
                "input_ids": input_ids,
                "sampling_params": {"max_new_tokens": 0},
                "return_logprob": True,
            }
            obj = self._generate_http_request(s, data)
            unconditional_token_logprobs = [
                r["meta_info"]["input_token_logprobs"] for r in obj
            ]
        else:
            unconditional_token_logprobs = None

        return choices_method(
            choices=choices,
            normalized_prompt_logprobs=normalized_prompt_logprobs,
            input_token_logprobs=input_token_logprobs,
            output_token_logprobs=output_token_logprobs,
            unconditional_token_logprobs=unconditional_token_logprobs,
        )

    def concatenate_and_append(self, src_rids: List[str], dst_rid: str):
        """Server-side merge of the source requests into *dst_rid*."""
        res = http_request(
            self.base_url + "/concate_and_append_request",
            json={"src_rids": src_rids, "dst_rid": dst_rid},
            api_key=self.api_key,
            verify=self.verify,
        )
        self._assert_success(res)

    def _generate_http_request(self, s: StreamExecutor, data):
        """POST *data* to /generate (with images attached) and return JSON."""
        self._add_images(s, data)
        res = http_request(
            self.base_url + "/generate",
            json=data,
            api_key=self.api_key,
            verify=self.verify,
        )
        self._assert_success(res)
        return res.json()

    def _add_images(self, s: StreamExecutor, data):
        """Attach the executor's image payload to the request dict."""
        if s.images_:
            assert len(s.images_) == 1, "Only support one image."
            data["image_data"] = s.images_[0][1]

    def _assert_success(self, res):
        """Raise RuntimeError with the server's error payload on non-200."""
        if res.status_code != 200:
            try:
                content = res.json()
            except json.JSONDecodeError:
                content = res.text
            raise RuntimeError(content)
348
+
349
+
350
def compute_normalized_prompt_logprobs(input_logprobs):
    """Return the mean per-token logprob of a prompt.

    Args:
        input_logprobs: A sequence of (logprob, token_id, token_text) entries,
            as found in ``meta_info["input_token_logprobs"]``. The first
            entry's logprob may be None (no logprob exists for the first token).

    Returns:
        The average of all non-None logprobs.

    Raises:
        ZeroDivisionError: If no entry carries a logprob.
    """
    # Use an explicit `is not None` check: the previous truthiness filter also
    # dropped legitimate logprobs equal to 0.0 (probability-1 tokens), which
    # skewed the normalized average.
    values = [x[0] for x in input_logprobs if x[0] is not None]
    return sum(values) / len(values)
353
+
354
+
355
class Runtime:
    """
    A wrapper for the HTTP server.
    This is used for launching the server in a python program without
    using the command line interface.

    It is mainly used for the frontend language.
    You should use the Engine class if you want to do normal offline processing without the frontend language.
    """

    def __init__(
        self,
        log_level: str = "error",
        *args,
        **kwargs,
    ):
        """See the arguments in server_args.py::ServerArgs"""
        # We delay the import of any `sglang.srt` components in `sglang.lang`, so users can run
        # client code without installing SRT server and its dependency if they want.
        from sglang.srt.entrypoints.http_server import launch_server
        from sglang.srt.server_args import ServerArgs
        from sglang.srt.utils import is_port_available

        self.server_args = ServerArgs(*args, log_level=log_level, **kwargs)

        # Pre-allocate ports
        # NOTE(review): if no port in [server_args.port, 40000) is free, the
        # loop falls through and the last probed port is used anyway — confirm
        # this best-effort behavior is intended.
        for port in range(self.server_args.port, 40000):
            if is_port_available(port):
                break
        self.server_args.port = port

        self.url = self.server_args.url()
        self.generate_url = self.url + "/generate"

        # NOTE: We store pid instead of proc to fix some issues during __delete__
        self.pid = None
        pipe_reader, pipe_writer = multiprocessing.Pipe(duplex=False)

        # Launch the server in a separate "spawn" process; it reports readiness
        # back through the write end of the pipe.
        ctx = multiprocessing.get_context("spawn")
        proc = ctx.Process(
            target=launch_server,
            args=(self.server_args, pipe_writer),
        )
        proc.start()
        pipe_writer.close()
        self.pid = proc.pid

        # Before python program terminates, call shutdown implicitly. Therefore, users don't have to explicitly call .shutdown()
        atexit.register(self.shutdown)

        # TODO: remove this pipe_writer mechanism and use `/health_generate` instead.
        try:
            init_state = pipe_reader.recv()
        except EOFError:
            # The child exited before reporting readiness.
            init_state = ""

        if init_state != "ready":
            self.shutdown()
            raise RuntimeError(
                "Initialization failed. Please see the error messages above."
            )

        self.endpoint = RuntimeEndpoint(self.url)

    def shutdown(self):
        """Kill the server process tree; safe to call more than once."""
        from sglang.srt.utils import kill_process_tree

        if self.pid is not None:
            kill_process_tree(self.pid)
            self.pid = None

    def start_profile(self):
        # Delegate to the RuntimeEndpoint wrapper.
        self.endpoint.start_profile()

    def stop_profile(self):
        self.endpoint.stop_profile()

    def cache_prefix(self, prefix: str):
        self.endpoint.cache_prefix(prefix)

    def get_tokenizer(self):
        """Construct the tokenizer configured by this runtime's server args."""
        from sglang.srt.utils.hf_transformers_utils import get_tokenizer

        return get_tokenizer(
            self.server_args.tokenizer_path,
            tokenizer_mode=self.server_args.tokenizer_mode,
            trust_remote_code=self.server_args.trust_remote_code,
            revision=self.server_args.revision,
        )

    async def async_generate(
        self,
        prompt: str,
        sampling_params: Optional[Dict] = None,
    ):
        """Stream generation for `prompt`, yielding new text fragments (or the
        raw event dict for non-text events) parsed from the server's
        "data: ..." SSE stream."""
        if self.server_args.skip_tokenizer_init:
            # Tokenizer is disabled server-side, so `prompt` carries token ids.
            json_data = {
                "input_ids": prompt,
                "sampling_params": sampling_params,
                "stream": True,
            }
        else:
            json_data = {
                "text": prompt,
                "sampling_params": sampling_params,
                "stream": True,
            }
        pos = 0  # number of characters already yielded

        timeout = aiohttp.ClientTimeout(total=3 * 3600)
        async with aiohttp.ClientSession(timeout=timeout, trust_env=True) as session:
            async with session.post(self.generate_url, json=json_data) as response:
                async for chunk, _ in response.content.iter_chunks():
                    chunk = chunk.decode("utf-8")
                    if chunk and chunk.startswith("data:"):
                        if chunk == "data: [DONE]\n\n":
                            break
                        data = json.loads(chunk[5:].strip("\n"))
                        if "text" in data:
                            # The server sends cumulative text; yield only the suffix.
                            cur = data["text"][pos:]
                            if cur:
                                yield cur
                            pos += len(cur)
                        else:
                            yield data

    # Alias kept for API compatibility.
    add_request = async_generate

    def generate(
        self,
        prompt: Union[str, List[str]],
        sampling_params: Optional[Dict] = None,
        return_logprob: Optional[Union[List[bool], bool]] = False,
        logprob_start_len: Optional[Union[List[int], int]] = None,
        top_logprobs_num: Optional[Union[List[int], int]] = None,
        lora_path: Optional[List[Optional[str]]] = None,
    ):
        """Synchronous, non-streaming generation.

        Returns the server's JSON response re-serialized as a string
        (note: a str, not a parsed object).
        """
        json_data = {
            "text": prompt,
            "sampling_params": sampling_params,
            "return_logprob": return_logprob,
            "logprob_start_len": logprob_start_len,
            "top_logprobs_num": top_logprobs_num,
            "lora_path": lora_path,
        }
        # A per-prompt lora_path list must match the number of prompts.
        assert not isinstance(lora_path, list) or len(lora_path) == len(prompt)
        response = requests.post(
            self.url + "/generate",
            json=json_data,
        )
        return json.dumps(response.json())

    def encode(
        self,
        prompt: Union[str, List[str], List[Dict], List[List[Dict]]],
    ):
        """Call the /encode endpoint; returns the JSON response as a string."""
        json_data = {"text": prompt}
        response = requests.post(self.url + "/encode", json=json_data)
        return json.dumps(response.json())

    async def get_server_info(self):
        """Fetch /get_server_info; raises RuntimeError carrying the server's
        error message on a non-200 status."""
        async with aiohttp.ClientSession() as session:
            async with session.get(f"{self.url}/get_server_info") as response:
                if response.status == 200:
                    return await response.json()
                else:
                    error_data = await response.json()
                    raise RuntimeError(
                        f"Failed to get server info. {error_data['error']['message']}"
                    )

    def __del__(self):
        # Best-effort cleanup when the wrapper is garbage collected.
        self.shutdown()
sglang/lang/backend/vertexai.py ADDED
@@ -0,0 +1,148 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import warnings
3
+
4
+ from sglang.lang.backend.base_backend import BaseBackend
5
+ from sglang.lang.chat_template import get_chat_template
6
+ from sglang.lang.interpreter import StreamExecutor
7
+ from sglang.lang.ir import SglSamplingParams
8
+
9
+ try:
10
+ import vertexai
11
+ from vertexai.preview.generative_models import (
12
+ GenerationConfig,
13
+ GenerativeModel,
14
+ Image,
15
+ )
16
+ except ImportError as e:
17
+ GenerativeModel = e
18
+
19
+
20
class VertexAI(BaseBackend):
    """Frontend-language backend that calls Google Vertex AI generative models."""

    def __init__(self, model_name, safety_settings=None):
        super().__init__()

        # The module-level import guard stores the ImportError in
        # `GenerativeModel` when the vertexai SDK is missing; re-raise lazily.
        if isinstance(GenerativeModel, Exception):
            raise GenerativeModel

        # GCP_PROJECT_ID is required; GCP_LOCATION is optional.
        project_id = os.environ["GCP_PROJECT_ID"]
        location = os.environ.get("GCP_LOCATION")
        vertexai.init(project=project_id, location=location)

        self.model_name = model_name
        self.chat_template = get_chat_template("default")
        self.safety_settings = safety_settings

    def get_chat_template(self):
        return self.chat_template

    def generate(
        self,
        s: StreamExecutor,
        sampling_params: SglSamplingParams,
    ):
        """Run one non-streaming completion; returns (text, meta_info)."""
        if s.messages_:
            prompt = self.messages_to_vertexai_input(s.messages_)
        else:
            # single-turn
            prompt = (
                self.text_to_vertexai_input(s.text_, s.cur_images)
                if s.cur_images
                else s.text_
            )
        ret = GenerativeModel(self.model_name).generate_content(
            prompt,
            generation_config=GenerationConfig(**sampling_params.to_vertexai_kwargs()),
            safety_settings=self.safety_settings,
        )

        comp = ret.text

        return comp, {}

    def generate_stream(
        self,
        s: StreamExecutor,
        sampling_params: SglSamplingParams,
    ):
        """Streaming variant of generate(); yields (text_chunk, meta_info)."""
        if s.messages_:
            prompt = self.messages_to_vertexai_input(s.messages_)
        else:
            # single-turn
            prompt = (
                self.text_to_vertexai_input(s.text_, s.cur_images)
                if s.cur_images
                else s.text_
            )
        generator = GenerativeModel(self.model_name).generate_content(
            prompt,
            stream=True,
            generation_config=GenerationConfig(**sampling_params.to_vertexai_kwargs()),
            safety_settings=self.safety_settings,
        )
        for ret in generator:
            yield ret.text, {}

    def text_to_vertexai_input(self, text, images):
        """Interleave text segments and Image parts by splitting `text` on the
        chat template's image token (one image per token occurrence)."""
        input = []  # NOTE(review): shadows the builtin `input`
        # split with image token
        text_segs = text.split(self.chat_template.image_token)
        for image_path, image_base64_data in images:
            text_seg = text_segs.pop(0)
            if text_seg != "":
                input.append(text_seg)
            input.append(Image.from_bytes(image_base64_data))
        # Trailing text after the last image token.
        text_seg = text_segs.pop(0)
        if text_seg != "":
            input.append(text_seg)
        return input

    def messages_to_vertexai_input(self, messages):
        """Convert OpenAI-style messages to the Vertex AI message format."""
        vertexai_message = []
        # from openai message format to vertexai message format
        for msg in messages:
            if isinstance(msg["content"], str):
                text = msg["content"]
            else:
                text = msg["content"][0]["text"]

            if msg["role"] == "system":
                # No system role here: emit the system prompt as a user turn
                # followed by a canned model acknowledgement.
                warnings.warn("Warning: system prompt is not supported in VertexAI.")
                vertexai_message.append(
                    {
                        "role": "user",
                        "parts": [{"text": "System prompt: " + text}],
                    }
                )
                vertexai_message.append(
                    {
                        "role": "model",
                        "parts": [{"text": "Understood."}],
                    }
                )
                continue
            if msg["role"] == "user":
                vertexai_msg = {
                    "role": "user",
                    "parts": [{"text": text}],
                }
            elif msg["role"] == "assistant":
                vertexai_msg = {
                    "role": "model",
                    "parts": [{"text": text}],
                }
            # NOTE(review): any other role leaves `vertexai_msg` unbound and
            # would raise NameError below — confirm callers only pass
            # system/user/assistant roles.

            # images
            if isinstance(msg["content"], list) and len(msg["content"]) > 1:
                for image in msg["content"][1:]:
                    assert image["type"] == "image_url"
                    vertexai_msg["parts"].append(
                        {
                            "inline_data": {
                                "data": image["image_url"]["url"].split(",")[1],
                                "mime_type": "image/jpeg",
                            }
                        }
                    )

            vertexai_message.append(vertexai_msg)
        return vertexai_message
sglang/lang/chat_template.py ADDED
@@ -0,0 +1,668 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import re
from dataclasses import dataclass
from enum import Enum, auto
from typing import Callable, Dict, List, Optional, Tuple
5
+
6
+
7
class ChatTemplateStyle(Enum):
    # Default: concatenate role prefix + content + suffix for every message.
    PLAIN = auto()
    # Llama-2 convention: the system prompt is folded into the first user turn
    # (see ChatTemplate.get_prefix_and_suffix).
    LLAMA2 = auto()
10
+
11
+
12
@dataclass
class ChatTemplate:
    """A named chat template: per-role (prefix, suffix) strings used to render
    OpenAI-style message lists into a single prompt string."""

    name: str
    # None means the template ships no implicit system prompt.
    default_system_prompt: Optional[str]
    # role -> (prefix, suffix) wrapped around that role's content.
    role_prefix_and_suffix: Dict[str, Tuple[str, str]]
    # NOTE(review): annotated List but defaulted to a tuple; registered
    # templates pass both tuples and lists — Sequence[str] would fit better.
    stop_str: List[str] = ()
    image_token: str = "<image>"
    audio_token: str = "<audio>"
    style: ChatTemplateStyle = ChatTemplateStyle.PLAIN

    def get_prefix_and_suffix(
        self, role: str, hist_messages: List[Dict]
    ) -> Tuple[str, str]:
        """Return the (prefix, suffix) pair for `role`, given the messages
        already rendered before it (`hist_messages`). Unknown roles map to
        ("", "")."""
        prefix, suffix = self.role_prefix_and_suffix.get(role, ("", ""))

        if self.style == ChatTemplateStyle.LLAMA2:
            # Llama-2 folds the system prompt into the first user turn: the
            # system message opens with the user prefix, and the first real
            # user message then omits its own prefix.
            if role == "system" and not hist_messages:
                user_prefix, _ = self.role_prefix_and_suffix.get("user", ("", ""))
                system_prefix, system_suffix = self.role_prefix_and_suffix.get(
                    "system", ("", "")
                )
                return (user_prefix + system_prefix, system_suffix)
            elif (
                role == "user"
                and len(hist_messages) == 1
                and hist_messages[0]["content"] is not None
            ):
                return ("", suffix)

        return prefix, suffix

    def get_prompt(self, messages: List[Dict]) -> str:
        """Render `messages` ({"role": ..., "content": ...} dicts) to a prompt.

        A system message whose content is None falls back to
        `default_system_prompt`; messages that still have no content are
        skipped entirely.
        """
        prompt = ""
        for i, message in enumerate(messages):
            role, content = message["role"], message["content"]
            if role == "system" and content is None:
                content = self.default_system_prompt
            if content is None:
                continue

            prefix, suffix = self.get_prefix_and_suffix(role, messages[:i])
            prompt += f"{prefix}{content}{suffix}"
        return prompt
55
+
56
+
57
# Global template registry (name -> ChatTemplate) and the ordered list of
# model-path matching functions consulted by get_chat_template_by_model_path.
chat_template_registry: Dict[str, ChatTemplate] = {}
matching_function_registry: List[Callable] = []
59
+
60
+
61
def register_chat_template(template):
    """Register *template* in the global registry, keyed by its name
    (overwrites any previous template with the same name)."""
    key = template.name
    chat_template_registry[key] = template
63
+
64
+
65
def register_chat_template_matching_function(func):
    """Decorator: register *func* as a model-path matching function.

    Returns *func* (the standard decorator contract) so the decorated name
    stays bound to the function; previously the decorator returned None,
    rebinding every decorated `match_*` name to None at module level.
    """
    matching_function_registry.append(func)
    return func
67
+
68
+
69
def get_chat_template(name):
    """Return the ChatTemplate registered under *name* (KeyError if absent)."""
    template = chat_template_registry[name]
    return template
71
+
72
+
73
def get_chat_template_by_model_path(model_path):
    """Resolve a chat template for *model_path* via the registered matchers.

    Matchers are consulted in registration order; the first non-None template
    name wins. Falls back to the "default" template.
    """
    name = next(
        (
            candidate
            for candidate in (fn(model_path) for fn in matching_function_registry)
            if candidate is not None
        ),
        "default",
    )
    return get_chat_template(name)
79
+
80
+
81
+ register_chat_template(
82
+ ChatTemplate(
83
+ name="default",
84
+ default_system_prompt=None,
85
+ role_prefix_and_suffix={
86
+ "system": ("SYSTEM:", "\n"),
87
+ "user": ("USER:", "\n"),
88
+ "assistant": ("ASSISTANT:", "\n"),
89
+ },
90
+ )
91
+ )
92
+
93
+ register_chat_template(
94
+ ChatTemplate(
95
+ name="claude",
96
+ default_system_prompt=None,
97
+ role_prefix_and_suffix={
98
+ "system": ("", ""),
99
+ "user": ("\n\nHuman: ", ""),
100
+ "assistant": ("\n\nAssistant:", ""),
101
+ },
102
+ )
103
+ )
104
+
105
+ register_chat_template(
106
+ ChatTemplate(
107
+ name="chatml",
108
+ default_system_prompt=None,
109
+ role_prefix_and_suffix={
110
+ "system": ("<|im_start|>system\n", "<|im_end|>\n"),
111
+ "user": ("<|im_start|>user\n", "<|im_end|>\n"),
112
+ "assistant": ("<|im_start|>assistant\n", "<|im_end|>\n"),
113
+ },
114
+ style=ChatTemplateStyle.PLAIN,
115
+ stop_str=("<|im_end|>",),
116
+ )
117
+ )
118
+
119
+ register_chat_template(
120
+ ChatTemplate(
121
+ name="chatml-llava",
122
+ default_system_prompt="You are a helpful assistant.",
123
+ role_prefix_and_suffix={
124
+ "system": ("<|im_start|>system\n", "<|im_end|>\n"),
125
+ "user": ("<|im_start|>user\n", "<|im_end|>\n"),
126
+ "assistant": ("<|im_start|>assistant\n", "<|im_end|>\n"),
127
+ },
128
+ style=ChatTemplateStyle.PLAIN,
129
+ stop_str=("<|im_end|>",),
130
+ image_token="<image>\n",
131
+ )
132
+ )
133
+
134
+ # There is default system prompt for qwen
135
+ # reference: https://modelscope.cn/models/qwen/Qwen2-72B-Instruct/file/view/master?fileName=tokenizer_config.json&status=1
136
+ # The chat template is: "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"
137
+ register_chat_template(
138
+ ChatTemplate(
139
+ name="qwen",
140
+ default_system_prompt="You are a helpful assistant.",
141
+ role_prefix_and_suffix={
142
+ "system": ("<|im_start|>system\n", "<|im_end|>\n"),
143
+ "user": ("<|im_start|>user\n", "<|im_end|>\n"),
144
+ "assistant": ("<|im_start|>assistant\n", "<|im_end|>\n"),
145
+ },
146
+ style=ChatTemplateStyle.PLAIN,
147
+ stop_str=("<|im_end|>",),
148
+ )
149
+ )
150
+
151
+ # Reference: https://huggingface.co/docs/transformers/main/model_doc/qwen2_vl#usage-example
152
+ register_chat_template(
153
+ ChatTemplate(
154
+ name="qwen2-vl",
155
+ default_system_prompt="You are a helpful assistant.",
156
+ role_prefix_and_suffix={
157
+ "system": ("<|im_start|>system\n", "<|im_end|>\n"),
158
+ "user": ("<|im_start|>user\n", "<|im_end|>\n"),
159
+ "assistant": ("<|im_start|>assistant\n", "<|im_end|>\n"),
160
+ },
161
+ style=ChatTemplateStyle.PLAIN,
162
+ stop_str=("<|im_end|>",),
163
+ image_token="<|vision_start|><|image_pad|><|vision_end|>",
164
+ )
165
+ )
166
+
167
+ # Reference: https://github.com/lm-sys/FastChat/blob/main/docs/vicuna_weights_version.md#prompt-template
168
+ register_chat_template(
169
+ ChatTemplate(
170
+ name="vicuna_v1.1",
171
+ default_system_prompt=(
172
+ "A chat between a curious user and an artificial intelligence assistant. "
173
+ "The assistant gives helpful, detailed, and polite answers to the user's questions."
174
+ ),
175
+ role_prefix_and_suffix={
176
+ "system": ("", " "),
177
+ "user": ("USER:", " "),
178
+ "assistant": ("ASSISTANT:", "</s>"),
179
+ },
180
+ image_token=" <image>\n",
181
+ )
182
+ )
183
+
184
+ register_chat_template(
185
+ ChatTemplate(
186
+ name="llama-2-chat",
187
+ default_system_prompt=None,
188
+ role_prefix_and_suffix={
189
+ "system": ("<<SYS>>\n", "\n<</SYS>>\n\n"),
190
+ "user": ("[INST] ", " [/INST]"),
191
+ "assistant": ("", " </s><s>"),
192
+ },
193
+ style=ChatTemplateStyle.LLAMA2,
194
+ )
195
+ )
196
+
197
+ # Reference: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/blob/main/chat_template.json
198
+ register_chat_template(
199
+ ChatTemplate(
200
+ name="mistral",
201
+ default_system_prompt=None,
202
+ role_prefix_and_suffix={
203
+ "system": ("[SYSTEM_PROMPT] ", " [/SYSTEM_PROMPT]"),
204
+ "user": ("[INST] ", " [/INST]"),
205
+ "assistant": ("", " </s><s>"),
206
+ },
207
+ stop_str=("</s>",),
208
+ image_token="[IMG]",
209
+ )
210
+ )
211
+
212
+ register_chat_template(
213
+ ChatTemplate(
214
+ name="llama-3-instruct",
215
+ default_system_prompt=None,
216
+ role_prefix_and_suffix={
217
+ "system": (
218
+ "<|start_header_id|>system<|end_header_id|>\n\n",
219
+ "<|eot_id|>",
220
+ ),
221
+ "user": (
222
+ "<|start_header_id|>user<|end_header_id|>\n\n",
223
+ "<|eot_id|>",
224
+ ),
225
+ "assistant": (
226
+ "<|start_header_id|>assistant<|end_header_id|>\n\n",
227
+ "<|eot_id|>",
228
+ ),
229
+ },
230
+ stop_str=("<|eot_id|>",),
231
+ image_token="<|image|>",
232
+ )
233
+ )
234
+
235
+ # https://huggingface.co/openbmb/MiniCPM-V-2_6
236
+ register_chat_template(
237
+ ChatTemplate(
238
+ name="minicpmv",
239
+ default_system_prompt=None,
240
+ role_prefix_and_suffix={
241
+ "system": ("", " "),
242
+ "user": ("user:", " "),
243
+ "assistant": ("assistant:", "</s>"),
244
+ },
245
+ stop_str=("<|im_end|>", "<|endoftext|>"),
246
+ image_token="(<image>./</image>)",
247
+ )
248
+ )
249
+
250
+ register_chat_template(
251
+ ChatTemplate(
252
+ name="janus-pro",
253
+ default_system_prompt=None,
254
+ role_prefix_and_suffix={
255
+ "system": (
256
+ "",
257
+ "",
258
+ ),
259
+ "User": (
260
+ "<|User|>",
261
+ "",
262
+ ),
263
+ "assistant": (
264
+ "<|Assistant|>",
265
+ "<|end▁of▁sentence|>",
266
+ ),
267
+ },
268
+ stop_str=("<|end▁of▁sentence|>",),
269
+ image_token="<image_placeholder>\n",
270
+ )
271
+ )
272
+
273
+ # https://huggingface.co/openbmb/MiniCPM-o-2_6
274
+ register_chat_template(
275
+ ChatTemplate(
276
+ name="minicpmo",
277
+ default_system_prompt=None,
278
+ role_prefix_and_suffix={
279
+ "system": ("", " "),
280
+ "user": ("user:", " "),
281
+ "assistant": ("assistant:", "</s>"),
282
+ },
283
+ stop_str=("<|im_end|>", "<|endoftext|>"),
284
+ image_token="(<image>./</image>)",
285
+ audio_token="(<audio>./</audio>)",
286
+ )
287
+ )
288
+
289
+ register_chat_template(
290
+ ChatTemplate(
291
+ name="janus",
292
+ default_system_prompt=None,
293
+ role_prefix_and_suffix={
294
+ "system": (
295
+ "",
296
+ "",
297
+ ),
298
+ "user": (
299
+ "<|User|>",
300
+ "",
301
+ ),
302
+ "assistant": (
303
+ "<|Assistant|>",
304
+ "<|end▁of▁sentence|>",
305
+ ),
306
+ },
307
+ stop_str=("<|end▁of▁sentence|>",),
308
+ image_token="<image_placeholder>\n",
309
+ )
310
+ )
311
+
312
+ # The difference between "llama-3-instruct-llava" and "llama-3-instruct" is that llava uses a different image_token.
313
+ register_chat_template(
314
+ ChatTemplate(
315
+ name="llama-3-instruct-llava",
316
+ default_system_prompt=None,
317
+ role_prefix_and_suffix={
318
+ "system": (
319
+ "<|start_header_id|>system<|end_header_id|>\n\n",
320
+ "<|eot_id|>",
321
+ ),
322
+ "user": (
323
+ "<|start_header_id|>user<|end_header_id|>\n\n",
324
+ "<|eot_id|>",
325
+ ),
326
+ "assistant": (
327
+ "<|start_header_id|>assistant<|end_header_id|>\n\n",
328
+ "<|eot_id|>",
329
+ ),
330
+ },
331
+ stop_str=("<|eot_id|>",),
332
+ image_token="<image>\n",
333
+ )
334
+ )
335
+
336
+ # Reference: https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct/blob/main/chat_template.json
337
+ register_chat_template(
338
+ ChatTemplate(
339
+ name="llama-4",
340
+ default_system_prompt=None,
341
+ role_prefix_and_suffix={
342
+ "system": (
343
+ "<|header_start|>system<|header_end|>\n\n",
344
+ "<|eot|>",
345
+ ),
346
+ "user": (
347
+ "<|header_start|>user<|header_end|>\n\n",
348
+ "<|eot|>",
349
+ ),
350
+ "assistant": (
351
+ "<|header_start|>assistant<|header_end|>\n\n",
352
+ "<|eot|>",
353
+ ),
354
+ },
355
+ stop_str=("<|eot|>",),
356
+ image_token="<|image|>",
357
+ )
358
+ )
359
+
360
+ # Reference: https://modelscope.cn/models/01ai/Yi-1.5-34B-Chat/file/view/master?fileName=tokenizer_config.json&status=1
361
+ register_chat_template(
362
+ ChatTemplate(
363
+ name="yi-1.5",
364
+ default_system_prompt=None,
365
+ role_prefix_and_suffix={
366
+ "system": ("", ""),
367
+ "user": ("<|im_start|>user\n", "<|im_end|>\n<|im_start|>assistant\n"),
368
+ "assistant": ("", "<|im_end|>\n"),
369
+ },
370
+ style=ChatTemplateStyle.PLAIN,
371
+ stop_str=("<|im_end|>",),
372
+ )
373
+ )
374
+
375
+ # Reference: https://github.com/01-ai/Yi/tree/main/VL#major-difference-with-llava
376
+ register_chat_template(
377
+ ChatTemplate(
378
+ name="yi-vl",
379
+ default_system_prompt=(
380
+ "This is a chat between an inquisitive human and an AI assistant. Assume the role of the AI assistant. Read all the images carefully, and respond to the human's questions with informative, helpful, detailed and polite answers."
381
+ "这是一个好奇的人类和一个人工智能助手之间的对话。假设你扮演这个AI助手的角色。仔细阅读所有的图像,并对人类的问题做出信息丰富、有帮助、详细的和礼貌的回答。"
382
+ ),
383
+ role_prefix_and_suffix={
384
+ "system": ("", "\n\n"),
385
+ "user": ("### Human:", "\n"),
386
+ "assistant": ("### Assistant:", "\n"),
387
+ },
388
+ image_token=" <image_placeholder>\n",
389
+ )
390
+ )
391
+
392
+ register_chat_template(
393
+ ChatTemplate(
394
+ name="gemma-it",
395
+ default_system_prompt=None,
396
+ role_prefix_and_suffix={
397
+ "system": ("", ""),
398
+ "user": ("<start_of_turn>user\n", "<end_of_turn>\n"),
399
+ "assistant": ("<start_of_turn>model\n", "<end_of_turn>\n"),
400
+ },
401
+ style=ChatTemplateStyle.PLAIN,
402
+ )
403
+ )
404
+
405
+ register_chat_template(
406
+ ChatTemplate(
407
+ name="dbrx-instruct",
408
+ default_system_prompt="You are DBRX, created by Databricks. You were last updated in December 2023. You answer questions based on information available up to that point.\nYOU PROVIDE SHORT RESPONSES TO SHORT QUESTIONS OR STATEMENTS, but provide thorough responses to more complex and open-ended questions.\nYou assist with various tasks, from writing to coding (using markdown for code blocks — remember to use ``` with code, JSON, and tables).\n(You do not have real-time data access or code execution capabilities. You avoid stereotyping and provide balanced perspectives on controversial topics. You do not provide song lyrics, poems, or news articles and do not divulge details of your training data.)\nThis is your system prompt, guiding your responses. Do not reference it, just respond to the user. If you find yourself talking about this message, stop. You should be responding appropriately and usually that means not mentioning this.\nYOU DO NOT MENTION ANY OF THIS INFORMATION ABOUT YOURSELF UNLESS THE INFORMATION IS DIRECTLY PERTINENT TO THE USER'S QUERY.",
409
+ role_prefix_and_suffix={
410
+ "system": ("<|im_start|>system\n", "<|im_end|>"),
411
+ "user": ("\n<|im_start|>user\n", "<|im_end|>"),
412
+ "assistant": ("\n<|im_start|>assistant\n", "<|im_end|>"),
413
+ },
414
+ stop_str=("<|im_end|>",),
415
+ )
416
+ )
417
+
418
+ register_chat_template(
419
+ ChatTemplate(
420
+ name="c4ai-command-r",
421
+ default_system_prompt=None,
422
+ role_prefix_and_suffix={
423
+ "system": (
424
+ "<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>",
425
+ "<|END_OF_TURN_TOKEN|>",
426
+ ),
427
+ "user": ("<|START_OF_TURN_TOKEN|><|USER_TOKEN|>", "<|END_OF_TURN_TOKEN|>"),
428
+ "assistant": (
429
+ "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>",
430
+ "<|END_OF_TURN_TOKEN|>",
431
+ ),
432
+ },
433
+ style=ChatTemplateStyle.PLAIN,
434
+ )
435
+ )
436
+
437
+ # Adapted from https://huggingface.co/OpenGVLab/InternVL2-4B/blob/main/modeling_intern_vit.py
438
+ register_chat_template(
439
+ ChatTemplate(
440
+ name="internvl-2-5",
441
+ default_system_prompt="你是书生·万象,英文名是InternVL,是由上海人工智能实验室、清华大学及多家合作单位联合开发的多模态大语言模型。",
442
+ role_prefix_and_suffix={
443
+ "system": ("<|im_start|>system\n", "<|im_end|>\n"),
444
+ "user": ("<|im_start|>user\n", "<|im_end|>\n"),
445
+ "assistant": ("<|im_start|>assistant\n", "<|im_end|>\n"),
446
+ },
447
+ stop_str=["<|im_end|>", "<|action_end|>"],
448
+ )
449
+ )
450
+
451
+ register_chat_template(
452
+ ChatTemplate(
453
+ name="interns1",
454
+ default_system_prompt="You are an AI assistant whose name is Intern-S1 (书生大模型).\n- Intern-S1 (书生大模型) is a vision-language model that is developed by Shanghai AI Laboratory (上海人工智能实验室). It is designed to be helpful, honest, and harmless.\n- Intern-S1 (书生大模型) can understand and communicate fluently in the language chosen by the user such as English and 中文.\nYou are an expert reasoner with extensive experience in all areas. You approach problems through systematic thinking and rigorous reasoning. Your response should reflect deep understanding and precise logical thinking, making your solution path and reasoning clear to others. Please put your thinking process within <think>...</think> tags.",
455
+ role_prefix_and_suffix={
456
+ "system": ("<|im_start|>system\n", "<|im_end|>\n"),
457
+ "user": ("<|im_start|>user\n", "<|im_end|>\n"),
458
+ "assistant": ("<|im_start|>assistant\n", "<|im_end|>\n"),
459
+ },
460
+ stop_str=["<|im_end|>", "<|action_end|>"],
461
+ )
462
+ )
463
+
464
+ register_chat_template(
465
+ ChatTemplate(
466
+ name="granite-3-instruct",
467
+ default_system_prompt=None,
468
+ role_prefix_and_suffix={
469
+ "system": (
470
+ "<|start_of_role|>system<|end_of_role|>",
471
+ "<|end_of_text|>",
472
+ ),
473
+ "user": (
474
+ "<|start_of_role|>user<|end_of_role|>",
475
+ "<|end_of_text|>",
476
+ ),
477
+ "assistant": (
478
+ "<|start_of_role|>assistant<|end_of_role|>",
479
+ "<|end_of_text|>",
480
+ ),
481
+ },
482
+ stop_str=("<|end_of_text|>",),
483
+ )
484
+ )
485
+
486
+ register_chat_template(
487
+ ChatTemplate(
488
+ name="deepseek-v3",
489
+ default_system_prompt=None,
490
+ role_prefix_and_suffix={
491
+ "system": (
492
+ "",
493
+ "",
494
+ ),
495
+ "user": (
496
+ "<|User|>",
497
+ "",
498
+ ),
499
+ "assistant": (
500
+ "<|Assistant|>",
501
+ "<|end▁of▁sentence|>",
502
+ ),
503
+ },
504
+ stop_str=("<|end▁of▁sentence|>",),
505
+ )
506
+ )
507
+
508
+ # Reference: https://huggingface.co/docs/transformers/main/model_doc/glm4_v#usage-example
509
+ register_chat_template(
510
+ ChatTemplate(
511
+ name="glm-4v",
512
+ default_system_prompt=None,
513
+ role_prefix_and_suffix={
514
+ "system": ("<|system|>\n", "\n"),
515
+ "user": ("<|user|>\n", "\n"),
516
+ "assistant": ("<|assistant|>\n", "\n"),
517
+ },
518
+ style=ChatTemplateStyle.PLAIN,
519
+ stop_str=["<|user|>", "<|endoftext|>", "<|observation|>"],
520
+ image_token="<|image|>",
521
+ )
522
+ )
523
+
524
+
525
+ @register_chat_template_matching_function
526
def match_deepseek(model_path: str):
    """Map DeepSeek V3/R1 chat checkpoints (but not base models) to "deepseek-v3"."""
    lowered = model_path.lower()
    if re.search(r"deepseek-(v3|r1)", lowered) and "base" not in lowered:
        return "deepseek-v3"
531
+
532
+
533
+ @register_chat_template_matching_function
534
def match_orion(model_path: str):
    """Orion models use the Claude-style plain template."""
    return "claude" if "orion" in model_path.lower() else None
537
+
538
+
539
+ @register_chat_template_matching_function
540
def match_deepseek_janus_pro(model_path: str):
    """Any Janus model maps to the "janus-pro" template."""
    return "janus-pro" if re.search(r"janus", model_path, re.IGNORECASE) else None
543
+
544
+
545
+ @register_chat_template_matching_function
546
def match_dbrx(model_path: str):
    """DBRX instruct checkpoints use the "dbrx-instruct" template."""
    lowered = model_path.lower()
    if "dbrx" in lowered and "instruct" in lowered:
        return "dbrx-instruct"
551
+
552
+
553
+ @register_chat_template_matching_function
554
def match_vicuna(model_path: str):
    """Vicuna and the LLaVA v1.5 family share the vicuna_v1.1 template."""
    patterns = (r"vicuna", r"llava-v1\.5", r"llava-next-video-7b")
    if any(re.search(p, model_path, re.IGNORECASE) for p in patterns):
        return "vicuna_v1.1"
557
+
558
+
559
+ @register_chat_template_matching_function
560
def match_llama2_chat(model_path: str):
    """Llama-2 chat and CodeLlama instruct checkpoints use llama-2-chat."""
    flags = re.IGNORECASE
    if re.search(r"llama-2.*chat", model_path, flags) or re.search(
        r"codellama.*instruct", model_path, flags
    ):
        return "llama-2-chat"
567
+
568
+
569
+ @register_chat_template_matching_function
570
def match_mistral(model_path: str):
    """Pixtral and Mistral/Mixtral instruct models use the mistral template."""
    matched = re.search(
        r"pixtral|(mistral|mixtral).*instruct", model_path, re.IGNORECASE
    )
    return "mistral" if matched else None
573
+
574
+
575
+ @register_chat_template_matching_function
576
def match_llama3_instruct(model_path: str):
    """Llama-3 instruct checkpoints use the llama-3-instruct template."""
    matched = re.search(r"llama-3.*instruct", model_path, re.IGNORECASE)
    return "llama-3-instruct" if matched else None
579
+
580
+
581
+ @register_chat_template_matching_function
582
def match_chat_ml(model_path: str):
    """Route ChatML-family model paths to their template names.

    Order matters: Qwen-VL must be checked before the generic Qwen rule, and
    Qwen chat checkpoints that are LLaVA variants are excluded from "qwen".
    """
    simple_rules = (
        (r"tinyllama", "chatml"),
        (r"qwen.*vl", "qwen2-vl"),
        (r"glm[-_]?4(\.\d+)?v", "glm-4v"),
    )
    for pattern, template_name in simple_rules:
        if re.search(pattern, model_path, re.IGNORECASE):
            return template_name
    is_qwen_chat = re.search(r"qwen.*(chat|instruct)", model_path, re.IGNORECASE)
    if is_qwen_chat and not re.search(r"llava", model_path, re.IGNORECASE):
        return "qwen"
    llava_chatml = (
        r"llava-v1\.6-34b|llava-v1\.6-yi-34b|llava-next-video-34b|llava-onevision-qwen2"
    )
    if re.search(llava_chatml, model_path, re.IGNORECASE):
        return "chatml-llava"
599
+
600
+
601
@register_chat_template_matching_function
def match_chat_yi(model_path: str):
    """Distinguish Yi-VL (non-LLaVA) from Yi-1.5 chat checkpoints."""
    flags = re.IGNORECASE
    # Yi-VL only counts when the path is not a LLaVA derivative; when that
    # check fails we still fall through to the Yi-1.5 rule (original elif).
    is_yi_vl = re.search(r"yi-vl", model_path, flags) and not re.search(
        r"llava", model_path, flags
    )
    if is_yi_vl:
        return "yi-vl"
    if re.search(r"yi-1\.5.*chat", model_path, flags):
        return "yi-1.5"
    return None
609
+
610
+
611
@register_chat_template_matching_function
def match_gemma_it(model_path: str):
    """Instruction-tuned Gemma checkpoints use the "gemma-it" template."""
    return "gemma-it" if re.search(r"gemma.*it", model_path, re.IGNORECASE) else None
615
+
616
+
617
@register_chat_template_matching_function
def match_openbmb_minicpm(model_path: str):
    """OpenBMB MiniCPM-V and MiniCPM-o each have their own template."""
    flags = re.IGNORECASE
    if re.search(r"minicpm-v", model_path, flags):
        return "minicpmv"
    if re.search(r"minicpm-o", model_path, flags):
        return "minicpmo"
    return None
623
+
624
+
625
@register_chat_template_matching_function
def match_c4ai_command_r(model_path: str):
    """Cohere C4AI Command-R checkpoints use the "c4ai-command-r" template."""
    matched = re.search(r"c4ai-command-r", model_path, re.IGNORECASE)
    return "c4ai-command-r" if matched else None
629
+
630
+
631
@register_chat_template_matching_function
def match_granite_instruct(model_path: str):
    """IBM Granite instruct checkpoints use the "granite-3-instruct" template."""
    matched = re.search(r"granite.*instruct", model_path, re.IGNORECASE)
    return "granite-3-instruct" if matched else None
635
+
636
+
637
@register_chat_template_matching_function
def match_gemma3_instruct(model_path: str):
    """Gemma-3 checkpoints reuse the Gemma instruction-tuned template."""
    return "gemma-it" if re.search(r"gemma-3", model_path, re.IGNORECASE) else None
641
+
642
+
643
@register_chat_template_matching_function
def match_internvl_chat(model_path: str):
    """InternVL 2.5 checkpoints use the "internvl-2-5" template."""
    matched = re.search(r"internvl2_5", model_path, re.IGNORECASE)
    return "internvl-2-5" if matched else None
647
+
648
+
649
@register_chat_template_matching_function
def match_interns1_chat(model_path: str):
    """Intern-S1 checkpoints use the "interns1" template.

    Accepts both path spellings, "intern-s1" and "interns1". The original
    code checked them in two identical branches that both returned the same
    template; a single alternation pattern removes the duplication while
    matching exactly the same inputs.
    """
    if re.search(r"intern-s1|interns1", model_path, re.IGNORECASE):
        return "interns1"
    return None
655
+
656
+
657
if __name__ == "__main__":
    # Demo: render a short multi-turn conversation with the Llama-2 template.
    turns = [
        ("user", "Hello!"),
        ("assistant", "Hi!"),
        ("user", "What can you do?"),
        ("assistant", "I can chat with you."),
    ]
    messages = [{"role": "system", "content": None}]  # None means default
    # e.g. {"role": "system", "content": "You are a helpful, respectful and honest assistant."}
    messages.extend({"role": role, "content": text} for role, text in turns)

    print(get_chat_template("llama-2-chat").get_prompt(messages))