| """ |
| Performance tests for 2-GPU that need large GPUs (H200 80GB) - MoE and Pipeline Parallel tests. |
| """ |
|
|
| import unittest |
|
|
| from sglang.test.ci.ci_register import register_amd_ci, register_cuda_ci |
| from sglang.test.test_utils import ( |
| DEFAULT_MOE_MODEL_NAME_FOR_TEST, |
| CustomTestCase, |
| is_in_amd_ci, |
| is_in_ci, |
| run_bench_serving, |
| write_github_step_summary, |
| ) |
|
|
# CI registration: assign this file to one CUDA suite and one AMD suite, each
# with its own est_time (presumably the expected runtime in seconds -- confirm
# against ci_register). Kept as plain literal keyword calls on purpose.
register_cuda_ci(est_time=600, suite="stage-b-test-large-2-gpu")
register_amd_ci(est_time=1100, suite="stage-b-test-large-2-gpu-amd")
|
|
|
|
class TestBenchServing2GPU(CustomTestCase):
    """Serving-throughput benchmarks that run with two-way parallelism.

    Every test calls ``run_bench_serving`` with either ``--tp 2`` or
    ``--pp-size 2`` and an unbounded request rate. Only when ``is_in_ci()`` is
    true is the measured throughput written to the GitHub step summary and
    compared against a minimum threshold; outside CI the benchmark just runs.
    Where a test has separate AMD and non-AMD thresholds, ``is_in_amd_ci()``
    selects between them.
    """

    def test_moe_offline_throughput_default(self):
        """Benchmark the default MoE test model with ``--tp 2`` on 300 prompts."""
        res = run_bench_serving(
            model=DEFAULT_MOE_MODEL_NAME_FOR_TEST,
            num_prompts=300,
            request_rate=float("inf"),
            other_server_args=["--tp", "2"],
        )

        if is_in_ci():
            output_throughput = res["output_throughput"]
            write_github_step_summary(
                "### test_moe_offline_throughput_default\n"
                f"Output throughput: {output_throughput:.2f} token/s\n"
            )
            # The AMD CI runners are held to a lower floor than the default ones.
            if is_in_amd_ci():
                self.assertGreater(output_throughput, 2100)
            else:
                self.assertGreater(output_throughput, 2200)

    def test_moe_offline_throughput_without_radix_cache(self):
        """Same as the default MoE benchmark, plus ``--disable-radix-cache``."""
        res = run_bench_serving(
            model=DEFAULT_MOE_MODEL_NAME_FOR_TEST,
            num_prompts=300,
            request_rate=float("inf"),
            other_server_args=["--tp", "2", "--disable-radix-cache"],
        )

        if is_in_ci():
            output_throughput = res["output_throughput"]
            write_github_step_summary(
                "### test_moe_offline_throughput_without_radix_cache\n"
                f"Output throughput: {output_throughput:.2f} token/s\n"
            )
            if is_in_amd_ci():
                self.assertGreater(output_throughput, 2100)
            else:
                self.assertGreater(output_throughput, 2200)

    def test_pp_offline_throughput_default_decode(self):
        """Benchmark ``--pp-size 2`` on a decode-heavy workload.

        Uses 1000 prompts with ``random_input_len=1`` and
        ``random_output_len=1024``, with warmup enabled and a fixed seed.
        """
        res = run_bench_serving(
            model=DEFAULT_MOE_MODEL_NAME_FOR_TEST,
            num_prompts=1000,
            request_rate=float("inf"),
            random_input_len=1,
            random_output_len=1024,
            other_server_args=["--pp-size", "2"],
            need_warmup=True,
            seed=42,
        )

        if is_in_ci():
            output_throughput = res["output_throughput"]
            write_github_step_summary(
                "### test_pp_offline_throughput_default_decode\n"
                f"Output throughput: {output_throughput:.2f} token/s\n"
            )
            # A single threshold applies here regardless of the CI flavour.
            self.assertGreater(output_throughput, 6700)

    def test_pp_long_context_prefill(self):
        """Benchmark ``--pp-size 2`` with fp8 quantization on long prompts.

        Uses 4 prompts with ``random_input_len=128000`` and
        ``random_output_len=1`` from the ``random`` dataset, without warmup.
        On AMD CI the server is additionally started with
        ``--mem-fraction-static 0.7``.
        """
        server_args = ["--quantization", "fp8", "--pp-size", "2"]
        if is_in_amd_ci():
            server_args += ["--mem-fraction-static", "0.7"]

        res = run_bench_serving(
            model="meta-llama/Llama-3.3-70B-Instruct",
            num_prompts=4,
            request_rate=float("inf"),
            random_input_len=128000,
            random_output_len=1,
            dataset_name="random",
            other_server_args=server_args,
            need_warmup=False,
            seed=42,
        )

        if is_in_ci():
            input_throughput = res["input_throughput"]
            # The summary header now matches the test name and the value is
            # reported as a throughput (token/s), not as a latency in ms.
            write_github_step_summary(
                "### test_pp_long_context_prefill\n"
                f"input_throughput: {input_throughput:.2f} token/s\n"
            )
            if is_in_amd_ci():
                self.assertGreater(input_throughput, 3000)
            else:
                self.assertGreater(input_throughput, 4000)
|
|
|
|
# Run the tests in this module when the file is executed as a script.
if __name__ == "__main__":
    unittest.main()
|
|