| import os |
| import unittest |
| from types import SimpleNamespace |
|
|
| import requests |
|
|
| from sglang.srt.environ import envs |
| from sglang.srt.utils import kill_process_tree |
| from sglang.test.ci.ci_register import register_cuda_ci |
| from sglang.test.few_shot_gsm8k import run_eval as run_eval_few_shot_gsm8k |
| from sglang.test.test_utils import ( |
| DEFAULT_DRAFT_MODEL_STANDALONE, |
| DEFAULT_TARGET_MODEL_STANDALONE, |
| DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, |
| DEFAULT_URL_FOR_TEST, |
| CustomTestCase, |
| popen_launch_server, |
| ) |
|
|
| |
# Register this test file with the CUDA CI under the given suite name.
# est_time is presumably the expected runtime in seconds -- TODO confirm.
register_cuda_ci(est_time=308, suite="stage-b-test-large-1-gpu")


# Forwarded as ``data_path`` to the GSM8K evaluation by the test classes in
# this file. None presumably lets the evaluator use its own default dataset
# location -- TODO confirm against run_eval.
GSM_DATASET_PATH = None
|
|
|
|
| |
# Server launch arguments shared by the non-V2 STANDALONE speculative-decoding
# tests (top-k 2, 7 draft tokens). Each test class appends its own
# ``--attention-backend`` choice to a copy of this list.
# Every value is a string so the list can be used as process argv as-is.
DEFAULT_SERVER_ARGS = [
    "--trust-remote-code",
    "--cuda-graph-max-bs",
    "8",
    "--speculative-algorithm",
    "STANDALONE",
    "--speculative-draft-model-path",
    DEFAULT_DRAFT_MODEL_STANDALONE,
    "--speculative-num-steps",
    "4",
    "--speculative-eagle-topk",
    "2",
    "--speculative-num-draft-tokens",
    "7",
    "--mem-fraction-static",
    "0.7",
]
|
|
| |
# Server launch arguments shared by the V2 STANDALONE speculative-decoding
# tests (top-k 1, 5 draft tokens). Each V2 test class appends its own
# ``--attention-backend`` choice to a copy of this list.
# Every value is a string so the list can be used as process argv as-is.
DEFAULT_SERVER_ARGS_V2 = [
    "--trust-remote-code",
    "--cuda-graph-max-bs",
    "8",
    "--speculative-algorithm",
    "STANDALONE",
    "--speculative-draft-model-path",
    DEFAULT_DRAFT_MODEL_STANDALONE,
    "--speculative-num-steps",
    "4",
    "--speculative-eagle-topk",
    "1",
    "--speculative-num-draft-tokens",
    "5",
    "--mem-fraction-static",
    "0.7",
]
|
|
|
|
class TestStandaloneSpeculativeDecodingBase(CustomTestCase):
    """GSM8K accuracy / acceptance-length check for STANDALONE speculative decoding.

    ``setUpClass`` launches one server for the whole class using the
    arguments returned by ``get_server_args``.  ``test_gsm8k`` runs the
    few-shot GSM8K evaluation against that server and asserts that both the
    reported accuracy and the server's ``avg_spec_accept_length`` exceed the
    class thresholds.  Subclasses override ``get_server_args`` to select a
    different attention backend.
    """

    model = DEFAULT_TARGET_MODEL_STANDALONE
    # Not referenced elsewhere in this file; the draft model actually passed
    # to the server comes from DEFAULT_SERVER_ARGS.
    draft_model = DEFAULT_DRAFT_MODEL_STANDALONE
    base_url = DEFAULT_URL_FOR_TEST
    # Both thresholds are strict lower bounds (checked with assertGreater).
    accuracy_threshold = 0.7
    spec_decode_threshold = 3.6
    # Upper bound in seconds for each direct HTTP call made by the test, so a
    # wedged server fails the test instead of hanging it.
    request_timeout = 60

    @classmethod
    def get_server_args(cls):
        """Return the arguments for the server launch. Override in subclasses."""
        return DEFAULT_SERVER_ARGS + ["--attention-backend", "fa3"]

    @classmethod
    def setUpClass(cls):
        """Launch the server shared by every test in this class."""
        # Switch both DeepGEMM JIT flags off before the launch; presumably
        # the spawned server process picks them up -- TODO confirm.
        envs.SGLANG_JIT_DEEPGEMM_PRECOMPILE.set(False)
        envs.SGLANG_ENABLE_JIT_DEEPGEMM.set(False)
        cls.process = popen_launch_server(
            cls.model,
            cls.base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            other_args=cls.get_server_args(),
        )

    @classmethod
    def tearDownClass(cls):
        """Kill the server process tree started in ``setUpClass``."""
        kill_process_tree(cls.process.pid)

    def test_gsm8k(self):
        """Run few-shot GSM8K and check accuracy and average accept length."""
        # Best-effort cache flush before measuring; the response is not inspected.
        requests.get(self.base_url + "/flush_cache", timeout=self.request_timeout)

        args = SimpleNamespace(
            num_shots=4,
            num_questions=100,
            max_new_tokens=512,
            parallel=128,
            host="http://127.0.0.1",
            # The port is taken from the last ':'-separated piece of base_url.
            port=int(self.base_url.split(":")[-1]),
            data_path=GSM_DATASET_PATH,
        )
        metrics = run_eval_few_shot_gsm8k(args)
        print(f"{metrics=}")

        self.assertGreater(metrics["accuracy"], self.accuracy_threshold)

        server_info = requests.get(
            self.base_url + "/get_server_info", timeout=self.request_timeout
        )
        # Fail with the HTTP error itself rather than a JSON/KeyError below.
        server_info.raise_for_status()
        avg_spec_accept_length = server_info.json()["internal_states"][0][
            "avg_spec_accept_length"
        ]
        print(f"{avg_spec_accept_length=}")
        self.assertGreater(avg_spec_accept_length, self.spec_decode_threshold)
|
|
|
|
class TestStandaloneV2SpeculativeDecodingBase(CustomTestCase):
    """GSM8K accuracy / acceptance-length check for STANDALONE decoding with spec V2.

    ``setUpClass`` sets ``SGLANG_ENABLE_SPEC_V2`` to True and launches one
    server for the whole class using the arguments returned by
    ``get_server_args``.  ``test_gsm8k`` runs the few-shot GSM8K evaluation
    against that server and asserts that both the reported accuracy and the
    server's ``avg_spec_accept_length`` exceed the class thresholds.
    ``tearDownClass`` stops the server and sets the flag back to False.
    Subclasses override ``get_server_args`` to select a different attention
    backend.
    """

    model = DEFAULT_TARGET_MODEL_STANDALONE
    # Not referenced elsewhere in this file; the draft model actually passed
    # to the server comes from DEFAULT_SERVER_ARGS_V2.
    draft_model = DEFAULT_DRAFT_MODEL_STANDALONE
    base_url = DEFAULT_URL_FOR_TEST
    # Both thresholds are strict lower bounds (checked with assertGreater).
    accuracy_threshold = 0.7
    spec_decode_threshold = 3.6
    # Upper bound in seconds for each direct HTTP call made by the test, so a
    # wedged server fails the test instead of hanging it.
    request_timeout = 60

    @classmethod
    def get_server_args(cls):
        """Return the arguments for the server launch. Override in subclasses."""
        return DEFAULT_SERVER_ARGS_V2 + ["--attention-backend", "fa3"]

    @classmethod
    def setUpClass(cls):
        """Enable spec V2 and launch the server shared by every test in this class."""
        # Switch both DeepGEMM JIT flags off and spec V2 on before the launch;
        # presumably the spawned server process picks them up -- TODO confirm.
        envs.SGLANG_JIT_DEEPGEMM_PRECOMPILE.set(False)
        envs.SGLANG_ENABLE_JIT_DEEPGEMM.set(False)
        envs.SGLANG_ENABLE_SPEC_V2.set(True)
        cls.process = popen_launch_server(
            cls.model,
            cls.base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            other_args=cls.get_server_args(),
        )

    @classmethod
    def tearDownClass(cls):
        """Kill the server process tree and set the spec-V2 flag back to False."""
        try:
            kill_process_tree(cls.process.pid)
        finally:
            # Reset the flag even when stopping the server fails, so it does
            # not stay enabled for whatever runs next in this process.
            if "SGLANG_ENABLE_SPEC_V2" in os.environ:
                envs.SGLANG_ENABLE_SPEC_V2.set(False)

    def test_gsm8k(self):
        """Run few-shot GSM8K and check accuracy and average accept length."""
        # Best-effort cache flush before measuring; the response is not inspected.
        requests.get(self.base_url + "/flush_cache", timeout=self.request_timeout)

        args = SimpleNamespace(
            num_shots=4,
            num_questions=100,
            max_new_tokens=512,
            parallel=128,
            host="http://127.0.0.1",
            # The port is taken from the last ':'-separated piece of base_url.
            port=int(self.base_url.split(":")[-1]),
            data_path=GSM_DATASET_PATH,
        )
        metrics = run_eval_few_shot_gsm8k(args)
        print(f"{metrics=}")

        self.assertGreater(metrics["accuracy"], self.accuracy_threshold)

        server_info = requests.get(
            self.base_url + "/get_server_info", timeout=self.request_timeout
        )
        # Fail with the HTTP error itself rather than a JSON/KeyError below.
        server_info.raise_for_status()
        avg_spec_accept_length = server_info.json()["internal_states"][0][
            "avg_spec_accept_length"
        ]
        print(f"{avg_spec_accept_length=}")
        self.assertGreater(avg_spec_accept_length, self.spec_decode_threshold)
|
|
|
|
class TestStandaloneSpeculativeDecodingTriton(TestStandaloneSpeculativeDecodingBase):
    """Run the inherited STANDALONE speculative-decoding test on the triton backend."""

    @classmethod
    def get_server_args(cls):
        """Return the shared server arguments plus the triton attention backend."""
        return [*DEFAULT_SERVER_ARGS, "--attention-backend", "triton"]
|
|
|
|
class TestStandaloneSpeculativeDecodingFlashinfer(
    TestStandaloneSpeculativeDecodingBase
):
    """Run the inherited STANDALONE speculative-decoding test on the flashinfer backend."""

    @classmethod
    def get_server_args(cls):
        """Return the shared server arguments plus the flashinfer attention backend."""
        return [*DEFAULT_SERVER_ARGS, "--attention-backend", "flashinfer"]
|
|
|
|
class TestStandaloneV2SpeculativeDecodingTriton(
    TestStandaloneV2SpeculativeDecodingBase
):
    """Run the inherited V2 STANDALONE speculative-decoding test on the triton backend."""

    @classmethod
    def get_server_args(cls):
        """Return the shared V2 server arguments plus the triton attention backend."""
        return [*DEFAULT_SERVER_ARGS_V2, "--attention-backend", "triton"]
|
|
|
|
class TestStandaloneV2SpeculativeDecodingFlashinfer(
    TestStandaloneV2SpeculativeDecodingBase
):
    """Run the inherited V2 STANDALONE speculative-decoding test on the flashinfer backend."""

    @classmethod
    def get_server_args(cls):
        """Return the shared V2 server arguments plus the flashinfer attention backend."""
        return [*DEFAULT_SERVER_ARGS_V2, "--attention-backend", "flashinfer"]
|
|
|
|
# Run every test case in this module when the file is executed as a script.
if __name__ == "__main__":
    unittest.main()
|
|