"""
Performance tests for 2-GPU that need large GPUs (H200 80GB) - MoE and Pipeline Parallel tests.
"""

import unittest

from sglang.test.ci.ci_register import register_amd_ci, register_cuda_ci
from sglang.test.test_utils import (
    DEFAULT_MOE_MODEL_NAME_FOR_TEST,
    CustomTestCase,
    is_in_amd_ci,
    is_in_ci,
    run_bench_serving,
    write_github_step_summary,
)

register_cuda_ci(est_time=600, suite="stage-b-test-large-2-gpu")
register_amd_ci(est_time=1100, suite="stage-b-test-large-2-gpu-amd")


class TestBenchServing2GPU(CustomTestCase):
    """Throughput benchmarks that run ``run_bench_serving`` with 2-GPU server args.

    Every test follows the same shape: run the benchmark, and when running in
    CI write the measured throughput to the GitHub step summary and compare it
    against a minimum value (a separate, lower floor is used on AMD CI where
    one is defined).

    NOTE(review): the source this was restored from had lost its indentation.
    The threshold assertions are placed inside the ``is_in_ci()`` guard, so
    they are skipped outside CI -- confirm this matches the intended layout.
    """

    def test_moe_offline_throughput_default(self):
        """Benchmark the default MoE model with ``--tp 2`` at unbounded request rate."""
        res = run_bench_serving(
            model=DEFAULT_MOE_MODEL_NAME_FOR_TEST,
            num_prompts=300,
            request_rate=float("inf"),
            other_server_args=["--tp", "2"],
        )

        if is_in_ci():
            write_github_step_summary(
                "### test_moe_offline_throughput_default\n"
                f"Output throughput: {res['output_throughput']:.2f} token/s\n"
            )
            if is_in_amd_ci():
                self.assertGreater(res["output_throughput"], 2100)
            else:
                self.assertGreater(res["output_throughput"], 2200)

    def test_moe_offline_throughput_without_radix_cache(self):
        """Same as the default MoE benchmark, with ``--disable-radix-cache`` added."""
        res = run_bench_serving(
            model=DEFAULT_MOE_MODEL_NAME_FOR_TEST,
            num_prompts=300,
            request_rate=float("inf"),
            other_server_args=["--tp", "2", "--disable-radix-cache"],
        )

        if is_in_ci():
            write_github_step_summary(
                "### test_moe_offline_throughput_without_radix_cache\n"
                f"Output throughput: {res['output_throughput']:.2f} token/s\n"
            )
            if is_in_amd_ci():
                self.assertGreater(res["output_throughput"], 2100)
            else:
                self.assertGreater(res["output_throughput"], 2200)

    def test_pp_offline_throughput_default_decode(self):
        """Benchmark ``--pp-size 2`` with 1-token inputs and 1024-token outputs."""
        res = run_bench_serving(
            model=DEFAULT_MOE_MODEL_NAME_FOR_TEST,
            num_prompts=1000,
            request_rate=float("inf"),
            random_input_len=1,
            random_output_len=1024,
            other_server_args=["--pp-size", "2"],
            need_warmup=True,
            seed=42,
        )

        if is_in_ci():
            write_github_step_summary(
                "### test_pp_offline_throughput_default_decode\n"
                f"Output throughput: {res['output_throughput']:.2f} token/s\n"
            )
            # Single floor for all CI platforms; no AMD-specific value is defined.
            self.assertGreater(res["output_throughput"], 6700)

    def test_pp_long_context_prefill(self):
        """Benchmark ``--pp-size 2`` with 128000-token inputs and 1-token outputs."""
        res = run_bench_serving(
            model="meta-llama/Llama-3.3-70B-Instruct",
            num_prompts=4,
            request_rate=float("inf"),
            random_input_len=128000,
            random_output_len=1,
            dataset_name="random",
            other_server_args=[
                "--quantization",
                "fp8",
                "--pp-size",
                "2",
            ]
            # AMD CI additionally passes an explicit static memory fraction.
            + (["--mem-fraction-static", "0.7"] if is_in_amd_ci() else []),
            need_warmup=False,
            seed=42,
        )

        if is_in_ci():
            # The heading names this test and the value is reported as a
            # throughput (token/s), matching the other summaries in this file.
            write_github_step_summary(
                "### test_pp_long_context_prefill\n"
                f"input_throughput: {res['input_throughput']:.2f} token/s\n"
            )
            if is_in_amd_ci():
                self.assertGreater(res["input_throughput"], 3000)
            else:
                self.assertGreater(res["input_throughput"], 4000)


if __name__ == "__main__":
    unittest.main()