| import unittest |
| from types import SimpleNamespace |
|
|
| from sglang.srt.utils import is_hip, kill_process_tree |
| from sglang.test.ci.ci_register import register_amd_ci, register_cuda_ci |
| from sglang.test.run_eval import run_eval |
| from sglang.test.test_utils import ( |
| DEFAULT_MODEL_NAME_FOR_ACCURACY_TEST_FP8, |
| DEFAULT_MODEL_NAME_FOR_DYNAMIC_QUANT_ACCURACY_TEST_FP8, |
| DEFAULT_MODEL_NAME_FOR_TEST, |
| DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, |
| DEFAULT_URL_FOR_TEST, |
| CustomTestCase, |
| popen_launch_server, |
| ) |
|
|
# Register this test file with the CUDA and AMD CI suites. Each call names the
# suite the file belongs to and an estimated runtime for it
# (NOTE(review): est_time is presumably in seconds -- confirm in ci_register).
register_cuda_ci(
    est_time=250,
    suite="stage-b-test-large-1-gpu",
)
register_amd_ci(
    est_time=600,
    suite="stage-b-test-small-1-gpu-amd",
)
|
|
|
|
class TestEvalFP8Accuracy(CustomTestCase):
    """MMLU accuracy check against a server running the default FP8 test model.

    One server is launched for the whole class in ``setUpClass`` and shut down
    in ``tearDownClass``.
    """

    @classmethod
    def setUpClass(cls):
        """Launch the server shared by every test in this class."""
        cls.model = DEFAULT_MODEL_NAME_FOR_ACCURACY_TEST_FP8
        cls.base_url = DEFAULT_URL_FOR_TEST
        cls.process = popen_launch_server(
            cls.model,
            cls.base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
        )

    @classmethod
    def tearDownClass(cls):
        """Call ``kill_process_tree`` on the server launched in ``setUpClass``."""
        kill_process_tree(cls.process.pid)

    def test_mmlu(self):
        """Run a 64-example MMLU eval and require a score of at least 0.60."""
        eval_config = {
            "base_url": self.base_url,
            "model": self.model,
            "eval_name": "mmlu",
            "num_examples": 64,
            "num_threads": 32,
            "temperature": 0.1,
        }
        metrics = run_eval(SimpleNamespace(**eval_config))

        # The threshold is selected per platform (HIP vs. everything else);
        # both platforms currently use the same value.
        min_score = 0.60 if is_hip() else 0.60
        self.assertGreaterEqual(metrics["score"], min_score)
|
|
|
|
class TestEvalFP8DynamicQuantAccuracy(CustomTestCase):
    """MMLU accuracy checks for several FP8 quantization launch setups.

    Unlike a class-level fixture, every test here launches (and tears down)
    its own server, because the model and launch flags differ per test.
    """

    def _run_test(self, model, other_args, expected_score):
        """Launch a server, run MMLU against it, and check the score.

        Args:
            model: Model name passed to ``popen_launch_server`` and the eval.
            other_args: Extra server launch arguments; a falsy value is
                replaced by an empty list.
            expected_score: Minimum acceptable value of ``metrics["score"]``.
        """
        server_url = DEFAULT_URL_FOR_TEST
        launch_flags = other_args if other_args else []

        server = popen_launch_server(
            model,
            server_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            other_args=launch_flags,
        )

        # The finally clause guarantees the server is cleaned up even when the
        # eval raises or the assertion fails.
        try:
            eval_args = SimpleNamespace(
                base_url=server_url,
                model=model,
                eval_name="mmlu",
                num_examples=64,
                num_threads=32,
                temperature=0.1,
            )
            score = run_eval(eval_args)["score"]
            self.assertGreaterEqual(score, expected_score)
        finally:
            kill_process_tree(server.pid)

    def test_mmlu_offline_only(self):
        """Offline quantization only: dynamic-quant FP8 model, no extra flags."""
        self._run_test(
            DEFAULT_MODEL_NAME_FOR_DYNAMIC_QUANT_ACCURACY_TEST_FP8,
            [],
            0.64,
        )

    def test_mmlu_offline_and_online_override(self):
        """Offline plus online quantization: FP8 model with --quantization w8a8_fp8."""
        self._run_test(
            DEFAULT_MODEL_NAME_FOR_DYNAMIC_QUANT_ACCURACY_TEST_FP8,
            ["--quantization", "w8a8_fp8"],
            0.64,
        )

    def test_mmlu_online_only(self):
        """Online quantization only: default test model with --quantization w8a8_fp8."""
        self._run_test(
            DEFAULT_MODEL_NAME_FOR_TEST,
            ["--quantization", "w8a8_fp8"],
            0.64,
        )

    def test_mmlu_fp16_baseline(self):
        """Unquantized fp16 baseline: default test model, no extra flags."""
        self._run_test(
            DEFAULT_MODEL_NAME_FOR_TEST,
            [],
            0.64,
        )
|
|
|
|
# Run the test cases in this module when the file is executed as a script.
if __name__ == "__main__":
    unittest.main()
|
|