Hanrui / sglang /test /registered /mla /test_flashmla.py

Add files using upload-large-folder tool

61ba51e verified about 2 months ago

4.22 kB

	"""
	Usage:
	python3 test/registered/mla/test_flashmla.py
	"""

	import unittest
	from types import SimpleNamespace

	import requests
	import torch

	from sglang.srt.utils import kill_process_tree
	from sglang.test.ci.ci_register import register_cuda_ci
	from sglang.test.few_shot_gsm8k import run_eval as run_eval_few_shot_gsm8k
	from sglang.test.test_utils import (
	DEFAULT_MODEL_NAME_FOR_TEST_MLA,
	DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
	DEFAULT_URL_FOR_TEST,
	CustomTestCase,
	popen_launch_server,
	)

	# CI registration for this file: FlashMLA attention backend tests, including
	# MTP speculative decoding. Declares an estimated runtime of 284 (presumably
	# seconds -- confirm against ci_register) for the suite
	# "stage-b-test-large-1-gpu".
	# NOTE(review): the arguments are plain literals; verify whether the CI
	# tooling parses this call statically before refactoring it.
	register_cuda_ci(est_time=284, suite="stage-b-test-large-1-gpu")


	class TestFlashMLAAttnBackend(CustomTestCase):
	    """GSM8K accuracy check for a server using the FlashMLA attention backend.

	    A single server process is launched for the whole class in
	    ``setUpClass`` and killed in ``tearDownClass``.  The class derives from
	    ``CustomTestCase`` -- the same project base class that
	    ``TestFlashMLAMTP`` in this file uses -- so both suites run on the same
	    test base.
	    """

	    @classmethod
	    def setUpClass(cls):
	        """Launch the server under test and keep its handle on ``cls.process``."""
	        cls.model = DEFAULT_MODEL_NAME_FOR_TEST_MLA
	        cls.base_url = DEFAULT_URL_FOR_TEST

	        other_args = ["--trust-remote-code"]
	        # The FlashMLA flags are only added when torch reports a usable CUDA
	        # device and a CUDA build; otherwise the server is launched with
	        # just --trust-remote-code.
	        if torch.cuda.is_available() and torch.version.cuda:
	            other_args.extend(
	                [
	                    "--cuda-graph-max-bs",
	                    "2",
	                    "--attention-backend",
	                    "flashmla",
	                ]
	            )

	        # Use longer timeout for DeepGEMM JIT compilation which can take
	        # 10-20 minutes.
	        cls.process = popen_launch_server(
	            cls.model,
	            cls.base_url,
	            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH * 2,
	            other_args=other_args,
	        )

	    @classmethod
	    def tearDownClass(cls):
	        """Kill the server process launched in ``setUpClass`` and its children."""
	        kill_process_tree(cls.process.pid)

	    def test_gsm8k(self):
	        """Run 5-shot GSM8K on 200 questions and require accuracy above 0.60."""
	        args = SimpleNamespace(
	            num_shots=5,
	            data_path=None,
	            num_questions=200,
	            max_new_tokens=512,
	            parallel=128,
	            host="http://127.0.0.1",
	            # The port is taken from the text after the last ":" of the
	            # base URL.
	            port=int(self.base_url.split(":")[-1]),
	        )
	        metrics = run_eval_few_shot_gsm8k(args)
	        print(metrics)

	        self.assertGreater(metrics["accuracy"], 0.60)


	class TestFlashMLAMTP(CustomTestCase):
	    """GSM8K accuracy and acceptance-length check for FlashMLA with EAGLE.

	    A single server process is launched for the whole class in
	    ``setUpClass`` (target model ``lmsys/sglang-ci-dsv3-test`` with draft
	    model ``lmsys/sglang-ci-dsv3-test-NextN``) and killed in
	    ``tearDownClass``.
	    """

	    # Upper bound, in seconds, for each HTTP call this test makes directly
	    # to the server.  Without a timeout ``requests`` waits indefinitely, so
	    # an unresponsive server would stall the whole job instead of failing
	    # the test.
	    REQUEST_TIMEOUT_SECONDS = 60

	    @classmethod
	    def setUpClass(cls):
	        """Launch the server under test and keep its handle on ``cls.process``."""
	        cls.model = "lmsys/sglang-ci-dsv3-test"
	        cls.base_url = DEFAULT_URL_FOR_TEST

	        other_args = ["--trust-remote-code"]
	        # The speculative-decoding and FlashMLA flags are only added when
	        # torch reports a usable CUDA device and a CUDA build; otherwise the
	        # server is launched with just --trust-remote-code.
	        if torch.cuda.is_available() and torch.version.cuda:
	            other_args.extend(
	                [
	                    "--cuda-graph-max-bs",
	                    "4",
	                    "--disable-radix",
	                    "--enable-torch-compile",
	                    "--torch-compile-max-bs",
	                    "1",
	                    "--speculative-algorithm",
	                    "EAGLE",
	                    "--speculative-draft-model-path",
	                    "lmsys/sglang-ci-dsv3-test-NextN",
	                    "--speculative-num-steps",
	                    "2",
	                    "--speculative-eagle-topk",
	                    "1",
	                    "--speculative-num-draft-tokens",
	                    "3",
	                    "--attention-backend",
	                    "flashmla",
	                ]
	            )

	        # Use longer timeout for DeepGEMM JIT compilation which can take
	        # 10-20 minutes.
	        cls.process = popen_launch_server(
	            cls.model,
	            cls.base_url,
	            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH * 2,
	            other_args=other_args,
	        )

	    @classmethod
	    def tearDownClass(cls):
	        """Kill the server process launched in ``setUpClass`` and its children."""
	        kill_process_tree(cls.process.pid)

	    def test_gsm8k(self):
	        """Check GSM8K accuracy (> 0.60) and average accept length (> 2.4)."""
	        # The flush response status is intentionally not checked, as in the
	        # original test; only the wait is bounded.
	        requests.get(
	            self.base_url + "/flush_cache",
	            timeout=self.REQUEST_TIMEOUT_SECONDS,
	        )

	        args = SimpleNamespace(
	            num_shots=5,
	            data_path=None,
	            num_questions=200,
	            max_new_tokens=512,
	            parallel=128,
	            host="http://127.0.0.1",
	            # The port is taken from the text after the last ":" of the
	            # base URL.
	            port=int(self.base_url.split(":")[-1]),
	        )
	        metrics = run_eval_few_shot_gsm8k(args)
	        print(metrics)

	        self.assertGreater(metrics["accuracy"], 0.60)

	        response = requests.get(
	            self.base_url + "/server_info",
	            timeout=self.REQUEST_TIMEOUT_SECONDS,
	        )
	        # Surface an HTTP error directly instead of a JSON decode error or
	        # KeyError on an error payload.
	        response.raise_for_status()
	        server_info = response.json()

	        # The metric is read from the first entry of "internal_states".
	        avg_spec_accept_length = server_info["internal_states"][0][
	            "avg_spec_accept_length"
	        ]
	        print(f"{avg_spec_accept_length=}")
	        self.assertGreater(avg_spec_accept_length, 2.4)


	if __name__ == "__main__":
	    # Direct execution (see the usage note in the module docstring) hands
	    # control to unittest's command-line runner.
	    unittest.main()