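"""Smoke tests for swift.llm deployment: launch `deploy_main` in a subprocess,
then exercise the resulting server with `InferClient` (non-streaming and
streaming, text and vision-language), covering the vllm, lmdeploy and pt
backends, plus a vanilla vLLM OpenAI server for comparison."""

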
def _test_client(port: int, print_logprobs: bool = False, test_vlm: bool = False):
    """Query the deployed server with a non-streaming and a streaming chat request."""
    import time
    import aiohttp
    from pprint import pprint
    from swift.llm import InferClient, InferRequest, RequestConfig

    infer_client = InferClient(port=port)

    # Poll until the deploy subprocess has loaded the model and the server is reachable.
    while True:
        try:
            models = infer_client.models
            print(f'models: {models}')
        except aiohttp.ClientConnectorError:
            time.sleep(5)
            continue
        break

    if test_vlm:
        query = '这是什么'  # 'What is this?'
        # Image source: http://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/cat.png
        # ('cat.png' below is expected to be a local copy of this image).
        messages = [{
            'role': 'user',
            'content': [
                {
                    'type': 'text',
                    'text': query
                },
                {
                    'type': 'image_url',
                    'image_url': {
                        'url': 'cat.png'
                    }
                },
            ]
        }]
    else:
        query = '123*234=?'
        messages = [{'role': 'user', 'content': query}]

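    # Build the request once; first run it synchronously. top_logprobs=5 asks for
    # the 5 most likely alternatives per generated token.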
    infer_request = InferRequest(messages=messages)
    request_config = RequestConfig(seed=42, max_tokens=256, temperature=0.8, logprobs=True, top_logprobs=5)

    resp = infer_client.infer([infer_request], request_config=request_config)[0]
    response = resp.choices[0].message.content
    print(f'query: {query}')
    print(f'response: {response}')
    if print_logprobs:
        pprint(resp.choices[0].logprobs)

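    # Second pass: the same request in streaming mode, printing tokens as they arrive.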
    request_config = RequestConfig(
        stream=True, seed=42, max_tokens=256, temperature=0.8, top_k=20, top_p=0.8, logprobs=True, top_logprobs=5)
    gen_list = infer_client.infer([infer_request], request_config=request_config)
    print(f'query: {query}')
    print('response: ', end='')
    for chunk in gen_list[0]:
        print(chunk.choices[0].delta.content, end='', flush=True)
        if print_logprobs and chunk.choices[0].logprobs is not None:
            pprint(chunk.choices[0].logprobs)
    print()


def _test(infer_backend, test_vlm: bool = False):
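    """Spawn `deploy_main` in a separate process, run the client test, then kill the server."""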
    import os
    os.environ['CUDA_VISIBLE_DEVICES'] = '0'

    from swift.llm import DeployArguments, deploy_main
    import multiprocessing
    mp = multiprocessing.get_context('spawn')
    model = 'Qwen/Qwen2-VL-7B-Instruct' if test_vlm else 'Qwen/Qwen2-7B-Instruct'
    args = DeployArguments(model=model, infer_backend=infer_backend, verbose=False)
    process = mp.Process(target=deploy_main, args=(args, ))
    process.start()
    _test_client(args.port, True, test_vlm)
    process.terminate()


def test_vllm_vlm():
    _test('vllm', test_vlm=True)


def test_vllm():
    _test('vllm')


def test_lmdeploy():
    _test('lmdeploy')


def test_pt():
    _test('pt')


def test_vllm_origin():
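    """Run the same client flow against a vanilla vLLM OpenAI-compatible server (default port 8000)."""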
    import subprocess
    import sys
    from modelscope import snapshot_download
    model_dir = snapshot_download('Qwen/Qwen2-7B-Instruct')
    args = [sys.executable, '-m', 'vllm.entrypoints.openai.api_server', '--model', model_dir]
    process = subprocess.Popen(args)
    _test_client(8000)
    process.terminate()

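
def _test_openai_client(port: int):
    # A minimal sketch, not part of the original tests: both deploy_main and the
    # vanilla vLLM server expose an OpenAI-compatible API, so the stock `openai`
    # client (pip install openai, v1+) can query them as well. Assumes a server
    # is already listening on `port`.
    from openai import OpenAI

    client = OpenAI(api_key='EMPTY', base_url=f'http://127.0.0.1:{port}/v1')
    model = client.models.list().data[0].id  # use the first served model
    resp = client.chat.completions.create(
        model=model,
        messages=[{'role': 'user', 'content': '123*234=?'}],
        max_tokens=256,
        temperature=0.8)
    print(f'response: {resp.choices[0].message.content}')
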

if __name__ == '__main__':
    # test_vllm_origin()
    # test_vllm()
    test_vllm_vlm()
    # test_lmdeploy()
    # test_pt()