Hanrui / sglang /test /registered /8-gpu-models /test_mimo_models.py
Lekr0's picture
Add files using upload-large-folder tool
61ba51e verified
import unittest
from sglang.test.ci.ci_register import register_cuda_ci
from sglang.test.kits.gsm8k_accuracy_kit import GSM8KMixin
from sglang.test.kits.spec_decoding_kit import SpecDecodingMixin
from sglang.test.server_fixtures.default_fixture import DefaultServerBase
# Declare CI metadata for this module via register_cuda_ci: it is assigned to
# the "stage-c-test-8-gpu-h200" suite with est_time=200.
# NOTE(review): est_time is presumably a runtime estimate in seconds, and CI
# tooling may read these literal keyword arguments statically — confirm
# against sglang.test.ci.ci_register before refactoring this call.
register_cuda_ci(est_time=200, suite="stage-c-test-8-gpu-h200")
class TestMiMoV2Flash(GSM8KMixin, SpecDecodingMixin, DefaultServerBase):
    """Configuration for the MiMo-V2-Flash GSM8K / speculative-decoding test.

    This class defines no methods of its own; it only sets class attributes.
    NOTE(review): the actual test methods and server start-up are presumably
    inherited from GSM8KMixin, SpecDecodingMixin and DefaultServerBase, which
    read the attributes below — confirm against those classes.
    """

    # --- Model under test -------------------------------------------------
    model = "XiaomiMiMo/MiMo-V2-Flash"

    # --- Server launch arguments -------------------------------------------
    # Flat argv-style list: each flag is followed by its value (if any) as a
    # separate element, so element order must be kept as-is.
    other_args = [
        # Parallelism layout.
        "--tp",
        "4",
        "--dp",
        "2",
        "--enable-dp-attention",
        # Model loading / attention kernel selection.
        "--trust-remote-code",
        "--attention-backend",
        "fa3",
        # Batching and memory limits.
        "--max-running-requests",
        "128",
        "--cuda-graph-max-bs",
        "64",
        "--mem-fraction-static",
        "0.75",
        # Speculative decoding settings.
        "--speculative-algorithm",
        "EAGLE",
        "--speculative-num-steps",
        "3",
        "--speculative-eagle-topk",
        "1",
        "--speculative-num-draft-tokens",
        "4",
        "--enable-multi-layer-eagle",
        # Loader tuning, passed through as a JSON string.
        "--model-loader-extra-config",
        '{"enable_multithread_load": true,"num_threads": 64}',
    ]

    # --- GSM8K accuracy settings -------------------------------------------
    # NOTE(review): presumably consumed by GSM8KMixin — minimum accepted
    # accuracy, number of questions, and request parallelism; confirm.
    gsm8k_accuracy_thres = 0.75
    gsm8k_num_questions = 1319
    gsm8k_parallel = 1319

    # --- Speculative-decoding thresholds -----------------------------------
    # NOTE(review): presumably consumed by SpecDecodingMixin as lower bounds
    # on batch-size-1 speed and average accept length; units are not visible
    # here — confirm.
    bs_1_speed_thres = 170
    accept_length_thres = 3.2
# Allow running this file directly as a script; unittest discovers and runs
# the test cases defined in this module.
if __name__ == "__main__":
    unittest.main()