| |
|
|
| import dataclasses |
| import multiprocessing as mp |
| import unittest |
| from types import SimpleNamespace |
| from typing import List |
|
|
| import torch |
|
|
| from sglang.srt.utils import is_hip, kill_process_tree |
| from sglang.test.ci.ci_register import register_amd_ci, register_cuda_ci |
| from sglang.test.runners import DEFAULT_PROMPTS, SRTRunner, check_close_model_outputs |
| from sglang.test.test_utils import ( |
| DEFAULT_MODEL_NAME_FOR_TEST, |
| DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, |
| DEFAULT_URL_FOR_TEST, |
| CustomTestCase, |
| is_in_ci, |
| popen_launch_server, |
| ) |
|
|
| register_cuda_ci(est_time=245, suite="stage-b-test-small-1-gpu") |
| register_amd_ci(est_time=320, suite="stage-b-test-small-1-gpu-amd") |
|
|
|
|
| class TestTransformersFallbackEndpoint(CustomTestCase): |
| @classmethod |
| def setUpClass(cls): |
| cls.model = DEFAULT_MODEL_NAME_FOR_TEST |
| cls.base_url = DEFAULT_URL_FOR_TEST |
| cls.process = popen_launch_server( |
| cls.model, |
| cls.base_url, |
| timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, |
| other_args=["--model-impl", "transformers"], |
| ) |
| cls.mmlu_lower_bound = 0.65 |
| cls.gsm8k_lower_bound = 0.65 |
|
|
| @classmethod |
| def tearDownClass(cls): |
| kill_process_tree(cls.process.pid) |
|
|
| def test_mmlu(self): |
| args = SimpleNamespace( |
| base_url=self.base_url, |
| model=self.model, |
| eval_name="mmlu", |
| num_examples=64, |
| num_threads=32, |
| ) |
| from sglang.test.run_eval import run_eval |
|
|
| metrics = run_eval(args) |
| self.assertGreaterEqual(metrics["score"], self.mmlu_lower_bound) |
|
|
| def test_gsm8k(self): |
| args = SimpleNamespace( |
| num_shots=5, |
| data_path=None, |
| num_questions=200, |
| max_new_tokens=512, |
| parallel=128, |
| host="http://127.0.0.1", |
| port=int(self.base_url.split(":")[-1]), |
| ) |
| from sglang.test.few_shot_gsm8k import run_eval |
|
|
| metrics = run_eval(args) |
| print(f"{metrics=}") |
| self.assertGreater(metrics["accuracy"], self.gsm8k_lower_bound) |
|
|
|
|
| @unittest.skipIf(is_hip(), "TorchAO int4wo quantization is not supported on AMD GPUs") |
| class TestTransformersFallbackTorchAO(TestTransformersFallbackEndpoint): |
| @classmethod |
| def setUpClass(cls): |
| cls.model = DEFAULT_MODEL_NAME_FOR_TEST |
| cls.base_url = DEFAULT_URL_FOR_TEST |
| cls.process = popen_launch_server( |
| cls.model, |
| cls.base_url, |
| timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, |
| other_args=[ |
| "--model-impl", |
| "transformers", |
| "--torchao-config", |
| "int4wo-128", |
| ], |
| ) |
| cls.mmlu_lower_bound = 0.65 |
| cls.gsm8k_lower_bound = 0.65 |
|
|
|
|
| @dataclasses.dataclass |
| class ModelCase: |
| model_path: str |
| tp_size: int = 1 |
| prefill_tolerance: float = 5e-2 |
| decode_tolerance: float = 5e-2 |
| rouge_l_tolerance: float = 1 |
| skip_long_prompt: bool = False |
| trust_remote_code: bool = False |
| torchao_config: str = None |
| torch_dtype: torch.dtype = torch.float16 |
|
|
|
|
| |
| CI_MODELS = [ |
| ModelCase(DEFAULT_MODEL_NAME_FOR_TEST), |
| ] |
|
|
| ALL_OTHER_MODELS = [ |
| ModelCase(DEFAULT_MODEL_NAME_FOR_TEST, tp_size=2), |
| ] |
|
|
|
|
| class TestTransformersFallbackEngine(CustomTestCase): |
| @classmethod |
| def setUpClass(cls): |
| mp.set_start_method("spawn", force=True) |
|
|
| def assert_close_logits_and_output_strs( |
| self, |
| prompts: List[str], |
| model_case: ModelCase, |
| ) -> None: |
| model_path = model_case.model_path |
| max_new_tokens = 32 |
| |
| with SRTRunner( |
| model_path, |
| tp_size=model_case.tp_size, |
| torch_dtype=model_case.torch_dtype, |
| model_type="generation", |
| model_impl="transformers", |
| trust_remote_code=model_case.trust_remote_code, |
| torchao_config=model_case.torchao_config, |
| ) as srt_runner: |
| srt_outputs = srt_runner.forward(prompts, max_new_tokens=max_new_tokens) |
|
|
| with SRTRunner( |
| model_path, |
| tp_size=model_case.tp_size, |
| torch_dtype=model_case.torch_dtype, |
| model_type="generation", |
| trust_remote_code=model_case.trust_remote_code, |
| torchao_config=model_case.torchao_config, |
| ) as srt_runner: |
| srt_transformers_outputs = srt_runner.forward( |
| prompts, max_new_tokens=max_new_tokens |
| ) |
|
|
| check_close_model_outputs( |
| hf_outputs=srt_transformers_outputs, |
| srt_outputs=srt_outputs, |
| prefill_tolerance=model_case.prefill_tolerance, |
| decode_tolerance=model_case.decode_tolerance, |
| rouge_l_tolerance=model_case.rouge_l_tolerance, |
| debug_text=f"model_path={model_path} prompts={prompts}", |
| ) |
|
|
| def test_ci_models(self): |
| for model_case in CI_MODELS: |
| |
| prompts = DEFAULT_PROMPTS |
| if model_case.skip_long_prompt: |
| prompts = [p for p in DEFAULT_PROMPTS if len(p) < 1000] |
| |
| self.assert_close_logits_and_output_strs(prompts, model_case) |
|
|
| def test_others(self): |
| if is_in_ci(): |
| return |
|
|
| |
| prompts = DEFAULT_PROMPTS |
| for model_case in ALL_OTHER_MODELS: |
| if model_case.skip_long_prompt: |
| prompts = [p for p in DEFAULT_PROMPTS if len(p) < 1000] |
|
|
| |
| self.assert_close_logits_and_output_strs(prompts, model_case) |
|
|
|
|
| if __name__ == "__main__": |
| unittest.main() |
|
|