| """ |
| Usage: |
| python3 test/registered/mla/test_flashmla.py |
| """ |
|
|
| import unittest |
| from types import SimpleNamespace |
|
|
| import requests |
| import torch |
|
|
| from sglang.srt.utils import kill_process_tree |
| from sglang.test.ci.ci_register import register_cuda_ci |
| from sglang.test.few_shot_gsm8k import run_eval as run_eval_few_shot_gsm8k |
| from sglang.test.test_utils import ( |
| DEFAULT_MODEL_NAME_FOR_TEST_MLA, |
| DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, |
| DEFAULT_URL_FOR_TEST, |
| CustomTestCase, |
| popen_launch_server, |
| ) |
|
|
| |
# Register this test file with the CUDA CI helper imported above.
# NOTE(review): est_time is presumably the expected runtime in seconds and
# suite the CI stage that picks this file up — confirm against ci_register.
register_cuda_ci(
    est_time=284,
    suite="stage-b-test-large-1-gpu",
)
|
|
|
|
class TestFlashMLAAttnBackend(CustomTestCase):
    """GSM8K accuracy check against a server started with the FlashMLA backend.

    Derives from ``CustomTestCase`` (imported from ``sglang.test.test_utils``),
    the same base that ``TestFlashMLAMTP`` in this file uses, so both
    server-backed suites share one test-case base class.
    """

    @classmethod
    def setUpClass(cls):
        """Launch one server process that every test in this class shares."""
        cls.model = DEFAULT_MODEL_NAME_FOR_TEST_MLA
        cls.base_url = DEFAULT_URL_FOR_TEST
        other_args = ["--trust-remote-code"]
        # The FlashMLA flags are appended only when torch reports a usable
        # CUDA device and was built against CUDA; otherwise the server is
        # launched with just --trust-remote-code.
        if torch.cuda.is_available() and torch.version.cuda:
            other_args.extend(
                [
                    "--cuda-graph-max-bs",
                    "2",
                    "--attention-backend",
                    "flashmla",
                ]
            )

        # Twice the default launch timeout is allowed for server start-up.
        cls.process = popen_launch_server(
            cls.model,
            cls.base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH * 2,
            other_args=other_args,
        )

    @classmethod
    def tearDownClass(cls):
        """Terminate the server process started in ``setUpClass``."""
        kill_process_tree(cls.process.pid)

    def test_gsm8k(self):
        """Run 5-shot GSM8K on 200 questions and require accuracy above 0.60."""
        args = SimpleNamespace(
            num_shots=5,
            data_path=None,
            num_questions=200,
            max_new_tokens=512,
            parallel=128,
            host="http://127.0.0.1",
            # The port is whatever follows the last ":" in the base URL.
            port=int(self.base_url.split(":")[-1]),
        )
        metrics = run_eval_few_shot_gsm8k(args)
        print(metrics)

        self.assertGreater(metrics["accuracy"], 0.60)
|
|
|
|
class TestFlashMLAMTP(CustomTestCase):
    """GSM8K accuracy and speculative accept-length check with FlashMLA.

    The server is launched with the EAGLE speculative algorithm, a separate
    draft model, and the ``flashmla`` attention backend (CUDA only).
    """

    # Upper bound, in seconds, for each auxiliary HTTP call made by the test.
    # Without an explicit timeout ``requests`` waits indefinitely, so an
    # unresponsive server would hang the test instead of failing it.
    _REQUEST_TIMEOUT_S = 60

    @classmethod
    def setUpClass(cls):
        """Launch one server process that every test in this class shares."""
        cls.model = "lmsys/sglang-ci-dsv3-test"
        cls.base_url = DEFAULT_URL_FOR_TEST
        other_args = ["--trust-remote-code"]
        # The speculative-decoding and FlashMLA flags are appended only when
        # torch reports a usable CUDA device and was built against CUDA.
        if torch.cuda.is_available() and torch.version.cuda:
            other_args.extend(
                [
                    "--cuda-graph-max-bs",
                    "4",
                    "--disable-radix",
                    "--enable-torch-compile",
                    "--torch-compile-max-bs",
                    "1",
                    "--speculative-algorithm",
                    "EAGLE",
                    "--speculative-draft-model-path",
                    "lmsys/sglang-ci-dsv3-test-NextN",
                    "--speculative-num-steps",
                    "2",
                    "--speculative-eagle-topk",
                    "1",
                    "--speculative-num-draft-tokens",
                    "3",
                    "--attention-backend",
                    "flashmla",
                ]
            )

        # Twice the default launch timeout is allowed for server start-up.
        cls.process = popen_launch_server(
            cls.model,
            cls.base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH * 2,
            other_args=other_args,
        )

    @classmethod
    def tearDownClass(cls):
        """Terminate the server process started in ``setUpClass``."""
        kill_process_tree(cls.process.pid)

    def test_gsm8k(self):
        """Require GSM8K accuracy > 0.60 and avg spec accept length > 2.4."""
        # Hit the server's /flush_cache endpoint before the evaluation. The
        # response is intentionally not inspected (best-effort, as before).
        requests.get(
            self.base_url + "/flush_cache", timeout=self._REQUEST_TIMEOUT_S
        )

        args = SimpleNamespace(
            num_shots=5,
            data_path=None,
            num_questions=200,
            max_new_tokens=512,
            parallel=128,
            host="http://127.0.0.1",
            # The port is whatever follows the last ":" in the base URL.
            port=int(self.base_url.split(":")[-1]),
        )
        metrics = run_eval_few_shot_gsm8k(args)
        print(metrics)

        self.assertGreater(metrics["accuracy"], 0.60)

        # Read the accept-length statistic reported by the server after the
        # evaluation run; only the first entry of "internal_states" is used.
        server_info = requests.get(
            self.base_url + "/server_info", timeout=self._REQUEST_TIMEOUT_S
        ).json()
        avg_spec_accept_length = server_info["internal_states"][0][
            "avg_spec_accept_length"
        ]
        print(f"{avg_spec_accept_length=}")
        self.assertGreater(avg_spec_accept_length, 2.4)
|
|
|
|
if __name__ == "__main__":
    # Executed as a script: hand control to unittest's command-line runner,
    # which runs the test cases defined in this module.
    unittest.main()
|
|