File size: 1,327 Bytes
61ba51e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
import unittest

from sglang.test.ci.ci_register import register_cuda_ci
from sglang.test.kits.gsm8k_accuracy_kit import GSM8KMixin
from sglang.test.kits.spec_decoding_kit import SpecDecodingMixin
from sglang.test.server_fixtures.default_fixture import DefaultServerBase

# Register this module with the CUDA CI under the "stage-c-test-8-gpu-h200"
# suite. NOTE(review): est_time=200 is presumably the expected runtime in
# seconds -- confirm against register_cuda_ci's definition.
register_cuda_ci(est_time=200, suite="stage-c-test-8-gpu-h200")


class TestMiMoV2Flash(GSM8KMixin, SpecDecodingMixin, DefaultServerBase):
    """Configuration for the XiaomiMiMo/MiMo-V2-Flash server test.

    The class body only declares attributes; no test methods are defined
    here. NOTE(review): the actual test logic and the way these attributes
    are consumed presumably live in GSM8KMixin, SpecDecodingMixin and
    DefaultServerBase, which are not visible in this file -- confirm there.
    """

    model = "XiaomiMiMo/MiMo-V2-Flash"

    # GSM8K settings (names suggest they are read by GSM8KMixin -- confirm).
    gsm8k_num_questions = 1319
    gsm8k_parallel = 1319
    gsm8k_accuracy_thres = 0.75

    # Speculative-decoding thresholds (names suggest they are read by
    # SpecDecodingMixin -- confirm units and comparison direction there).
    accept_length_thres = 3.2
    bs_1_speed_thres = 170

    # Extra command-line style arguments, grouped by concern. The
    # concatenation yields one flat list in exactly this order.
    other_args = (
        # Parallelism layout.
        ["--tp", "4", "--dp", "2", "--enable-dp-attention"]
        + ["--trust-remote-code"]
        + ["--attention-backend", "fa3"]
        # Batching and memory limits.
        + ["--max-running-requests", "128"]
        + ["--cuda-graph-max-bs", "64"]
        + ["--mem-fraction-static", "0.75"]
        # Speculative decoding.
        + ["--speculative-algorithm", "EAGLE"]
        + ["--speculative-num-steps", "3"]
        + ["--speculative-eagle-topk", "1"]
        + ["--speculative-num-draft-tokens", "4"]
        + ["--enable-multi-layer-eagle"]
        # Model loading; the value is a JSON string and is kept verbatim.
        + [
            "--model-loader-extra-config",
            '{"enable_multithread_load": true,"num_threads": 64}',
        ]
    )


# Run the module's tests via unittest when executed directly as a script.
if __name__ == "__main__":
    unittest.main()