Hanrui / sglang /test /registered /distributed /test_disaggregation_pp.py

Add files using upload-large-folder tool

61ba51e verified about 1 month ago

7.01 kB

	import time
	import unittest
	from types import SimpleNamespace

	from sglang.test.ci.ci_register import register_cuda_ci
	from sglang.test.few_shot_gsm8k import run_eval
	from sglang.test.server_fixtures.disaggregation_fixture import (
	PDDisaggregationServerBase,
	)
	from sglang.test.test_utils import (
	DEFAULT_MODEL_NAME_FOR_TEST,
	DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
	popen_launch_pd_server,
	try_cached_model,
	)

	# Register this module via register_cuda_ci for the "stage-c-test-8-gpu-h20"
	# suite. NOTE(review): est_time=180 is presumably seconds — confirm against
	# sglang.test.ci.ci_register.
	register_cuda_ci(est_time=180, suite="stage-c-test-8-gpu-h20")


	class TestDisaggregationPrefillPPAccuracy(PDDisaggregationServerBase):
	    """GSM8K accuracy test with a pipeline-parallel prefill server.

	    The prefill server is launched with ``--tp-size 2 --pp-size 2`` and the
	    decode server with ``--tp-size 2 --base-gpu-id 4``.
	    """

	    @classmethod
	    def setUpClass(cls):
	        """Start the prefill and decode servers, wait for both, then call launch_lb()."""
	        super().setUpClass()
	        cls.model = try_cached_model(DEFAULT_MODEL_NAME_FOR_TEST)

	        # Issue both launches before waiting on either one.
	        cls.start_prefill()
	        cls.start_decode()

	        # Block on each server's /health URL in turn.
	        prefill_health = cls.prefill_url + "/health"
	        cls.wait_server_ready(prefill_health, process=cls.process_prefill)
	        decode_health = cls.decode_url + "/health"
	        cls.wait_server_ready(decode_health, process=cls.process_decode)

	        cls.launch_lb()

	    @classmethod
	    def start_prefill(cls):
	        """Launch the prefill server and store its handle in ``cls.process_prefill``."""
	        server_flags = ["--trust-remote-code"]
	        server_flags += ["--disaggregation-mode", "prefill"]
	        server_flags += ["--tp-size", "2"]
	        server_flags += ["--pp-size", "2"]
	        server_flags.append("--disable-overlap-schedule")
	        # cls.transfer_backend / cls.rdma_devices are not defined in this class.
	        server_flags.extend(cls.transfer_backend + cls.rdma_devices)

	        cls.process_prefill = popen_launch_pd_server(
	            cls.model,
	            cls.prefill_url,
	            other_args=server_flags,
	            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
	        )

	    @classmethod
	    def start_decode(cls):
	        """Launch the decode server and store its handle in ``cls.process_decode``."""
	        server_flags = ["--trust-remote-code"]
	        server_flags += ["--disaggregation-mode", "decode"]
	        server_flags += ["--tp-size", "2"]
	        server_flags += ["--base-gpu-id", "4"]
	        # cls.transfer_backend / cls.rdma_devices are not defined in this class.
	        server_flags.extend(cls.transfer_backend + cls.rdma_devices)

	        cls.process_decode = popen_launch_pd_server(
	            cls.model,
	            cls.decode_url,
	            other_args=server_flags,
	            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
	        )

	    def test_gsm8k(self):
	        # 5-shot GSM8K run of 200 questions against lb_port; accuracy must
	        # exceed 0.24.
	        eval_settings = {
	            "num_shots": 5,
	            "data_path": None,
	            "num_questions": 200,
	            "max_new_tokens": 512,
	            "parallel": 128,
	            "host": f"http://{self.base_host}",
	            "port": int(self.lb_port),
	        }
	        metrics = run_eval(SimpleNamespace(**eval_settings))
	        # The local must stay named ``metrics``: the f-string prints its name.
	        print(f"{metrics=}")

	        self.assertGreater(metrics["accuracy"], 0.24)
	        # Wait a little bit so that the memory check happens.
	        time.sleep(5)


	class TestDisaggregationPrefillPPDynamicChunkAccuracy(PDDisaggregationServerBase):
	    """GSM8K accuracy test with pipeline-parallel prefill and dynamic chunking.

	    The prefill server is launched with ``--tp-size 2 --pp-size 2`` plus
	    ``--enable-dynamic-chunking``; the decode server uses ``--tp-size 2
	    --base-gpu-id 4``.
	    """

	    @classmethod
	    def setUpClass(cls):
	        """Start the prefill and decode servers, wait for both, then call launch_lb()."""
	        super().setUpClass()
	        cls.model = try_cached_model(DEFAULT_MODEL_NAME_FOR_TEST)

	        # Issue both launches before waiting on either one.
	        cls.start_prefill()
	        cls.start_decode()

	        # Block on each server's /health URL in turn.
	        prefill_health = cls.prefill_url + "/health"
	        cls.wait_server_ready(prefill_health, process=cls.process_prefill)
	        decode_health = cls.decode_url + "/health"
	        cls.wait_server_ready(decode_health, process=cls.process_decode)

	        cls.launch_lb()

	    @classmethod
	    def start_prefill(cls):
	        """Launch the prefill server and store its handle in ``cls.process_prefill``."""
	        server_flags = ["--trust-remote-code"]
	        server_flags += ["--disaggregation-mode", "prefill"]
	        server_flags += ["--tp-size", "2"]
	        server_flags += ["--pp-size", "2"]
	        server_flags.append("--disable-overlap-schedule")
	        # The only flag that differs from the plain prefill-PP test class.
	        server_flags.append("--enable-dynamic-chunking")
	        # cls.transfer_backend / cls.rdma_devices are not defined in this class.
	        server_flags.extend(cls.transfer_backend + cls.rdma_devices)

	        cls.process_prefill = popen_launch_pd_server(
	            cls.model,
	            cls.prefill_url,
	            other_args=server_flags,
	            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
	        )

	    @classmethod
	    def start_decode(cls):
	        """Launch the decode server and store its handle in ``cls.process_decode``."""
	        server_flags = ["--trust-remote-code"]
	        server_flags += ["--disaggregation-mode", "decode"]
	        server_flags += ["--tp-size", "2"]
	        server_flags += ["--base-gpu-id", "4"]
	        # cls.transfer_backend / cls.rdma_devices are not defined in this class.
	        server_flags.extend(cls.transfer_backend + cls.rdma_devices)

	        cls.process_decode = popen_launch_pd_server(
	            cls.model,
	            cls.decode_url,
	            other_args=server_flags,
	            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
	        )

	    def test_gsm8k(self):
	        # 5-shot GSM8K run of 200 questions against lb_port; accuracy must
	        # exceed 0.24.
	        eval_settings = {
	            "num_shots": 5,
	            "data_path": None,
	            "num_questions": 200,
	            "max_new_tokens": 512,
	            "parallel": 128,
	            "host": f"http://{self.base_host}",
	            "port": int(self.lb_port),
	        }
	        metrics = run_eval(SimpleNamespace(**eval_settings))
	        # The local must stay named ``metrics``: the f-string prints its name.
	        print(f"{metrics=}")

	        self.assertGreater(metrics["accuracy"], 0.24)
	        # Wait a little bit so that the memory check happens.
	        time.sleep(5)


	class TestDisaggregationDecodePPAccuracy(PDDisaggregationServerBase):
	    """GSM8K accuracy test with pipeline parallelism on both servers.

	    Both the prefill and the decode server are launched with ``--tp-size 2
	    --pp-size 2``; the decode server additionally gets ``--base-gpu-id 4``.
	    """

	    @classmethod
	    def setUpClass(cls):
	        """Start the prefill and decode servers, wait for both, then call launch_lb()."""
	        super().setUpClass()
	        cls.model = try_cached_model(DEFAULT_MODEL_NAME_FOR_TEST)

	        # Issue both launches before waiting on either one.
	        cls.start_prefill()
	        cls.start_decode()

	        # Block on each server's /health URL in turn.
	        prefill_health = cls.prefill_url + "/health"
	        cls.wait_server_ready(prefill_health, process=cls.process_prefill)
	        decode_health = cls.decode_url + "/health"
	        cls.wait_server_ready(decode_health, process=cls.process_decode)

	        cls.launch_lb()

	    @classmethod
	    def start_prefill(cls):
	        """Launch the prefill server and store its handle in ``cls.process_prefill``."""
	        server_flags = ["--trust-remote-code"]
	        server_flags += ["--disaggregation-mode", "prefill"]
	        server_flags += ["--tp-size", "2"]
	        server_flags += ["--pp-size", "2"]
	        server_flags.append("--disable-overlap-schedule")
	        # cls.transfer_backend / cls.rdma_devices are not defined in this class.
	        server_flags.extend(cls.transfer_backend + cls.rdma_devices)

	        cls.process_prefill = popen_launch_pd_server(
	            cls.model,
	            cls.prefill_url,
	            other_args=server_flags,
	            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
	        )

	    @classmethod
	    def start_decode(cls):
	        """Launch the decode server and store its handle in ``cls.process_decode``."""
	        server_flags = ["--trust-remote-code"]
	        server_flags += ["--disaggregation-mode", "decode"]
	        server_flags += ["--tp-size", "2"]
	        # Unlike the prefill-only PP test classes, decode also sets --pp-size.
	        server_flags += ["--pp-size", "2"]
	        server_flags += ["--base-gpu-id", "4"]
	        # cls.transfer_backend / cls.rdma_devices are not defined in this class.
	        server_flags.extend(cls.transfer_backend + cls.rdma_devices)

	        cls.process_decode = popen_launch_pd_server(
	            cls.model,
	            cls.decode_url,
	            other_args=server_flags,
	            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
	        )

	    def test_gsm8k(self):
	        # 5-shot GSM8K run of 200 questions against lb_port; accuracy must
	        # exceed 0.24.
	        eval_settings = {
	            "num_shots": 5,
	            "data_path": None,
	            "num_questions": 200,
	            "max_new_tokens": 512,
	            "parallel": 128,
	            "host": f"http://{self.base_host}",
	            "port": int(self.lb_port),
	        }
	        metrics = run_eval(SimpleNamespace(**eval_settings))
	        # The local must stay named ``metrics``: the f-string prints its name.
	        print(f"{metrics=}")

	        self.assertGreater(metrics["accuracy"], 0.24)
	        # Wait a little bit so that the memory check happens.
	        time.sleep(5)


	# Hand control to unittest's command-line runner when this file is executed
	# directly rather than imported.
	if __name__ == "__main__":
	unittest.main()