""" Performance tests for single GPU that need H200 (80GB) - FP8 and EAGLE tests. """ import unittest from sglang.srt.utils import is_hip from sglang.test.ci.ci_register import register_amd_ci, register_cuda_ci from sglang.test.test_utils import ( DEFAULT_DRAFT_MODEL_EAGLE, DEFAULT_MODEL_NAME_FOR_TEST_FP8, DEFAULT_TARGET_MODEL_EAGLE, CustomTestCase, is_in_amd_ci, is_in_ci, run_bench_serving, write_github_step_summary, ) register_cuda_ci(est_time=300, suite="stage-b-test-large-1-gpu") register_amd_ci(est_time=300, suite="stage-b-test-large-1-gpu-amd") class TestBenchServing1GPULarge(CustomTestCase): def test_offline_throughput_default_fp8(self): res = run_bench_serving( model=DEFAULT_MODEL_NAME_FOR_TEST_FP8, num_prompts=500, request_rate=float("inf"), other_server_args=[], ) if is_in_ci(): write_github_step_summary( f"### test_offline_throughput_default_fp8\n" f"Output throughput: {res['output_throughput']:.2f} token/s\n" ) if is_in_amd_ci(): self.assertGreater(res["output_throughput"], 3500) else: self.assertGreater(res["output_throughput"], 4300) @unittest.skipIf(is_hip(), "Skip Eagle test for ROCm") def test_online_latency_eagle(self): res = run_bench_serving( model=DEFAULT_TARGET_MODEL_EAGLE, num_prompts=300, request_rate=8, sharegpt_context_len=3072, disable_ignore_eos=True, dataset_name="sharegpt", other_server_args=[ "--speculative-algorithm", "EAGLE", "--speculative-draft-model-path", DEFAULT_DRAFT_MODEL_EAGLE, "--speculative-num-steps", "5", "--speculative-eagle-topk", "4", "--speculative-num-draft-tokens", "16", "--mem-fraction-static", "0.7", ], need_warmup=True, seed=42, ) if is_in_ci(): write_github_step_summary( f"### test_online_latency_eagle\n" f"median_e2e_latency_ms: {res['median_e2e_latency_ms']:.2f} ms\n" f"accept_length: {res['accept_length']:.2f} \n" ) if is_in_amd_ci(): self.assertLess(res["median_e2e_latency_ms"], 1800) else: self.assertLess(res["median_e2e_latency_ms"], 900) self.assertGreater(res["accept_length"], 3.0) if __name__ == "__main__": unittest.main()