yuccaaa committed on
Commit
91ba179
·
verified ·
1 Parent(s): ff5a13c

Upload ms-swift/examples/deploy/client/llm/chat/swift_client.py with huggingface_hub

Browse files
ms-swift/examples/deploy/client/llm/chat/swift_client.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# Copyright (c) Alibaba, Inc. and its affiliates.
import os
from typing import List

# Restrict inference to GPU 0. Set at import time, before any CUDA-backed
# library initializes, so the backend only ever sees this single device.
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
def infer_batch(engine: 'InferEngine', infer_requests: List['InferRequest']):
    """Run a synchronous batch inference call and print the first query/response plus metrics."""
    config = RequestConfig(max_tokens=512, temperature=0)
    stats = InferStats()

    responses = engine.infer(infer_requests, config, metrics=[stats])
    # # The asynchronous interface below is equivalent to the synchronous interface above.
    # async def _run():
    #     tasks = [engine.infer_async(req, config) for req in infer_requests]
    #     return await asyncio.gather(*tasks)
    # responses = asyncio.run(_run())

    query0 = infer_requests[0].messages[0]['content']
    print(f'query0: {query0}')
    print(f'response0: {responses[0].choices[0].message.content}')
    print(f'metric: {stats.compute()}')
+
24
+
def infer_stream(engine: 'InferEngine', infer_request: 'InferRequest'):
    """Stream a single request, echoing response tokens as they arrive, then print metrics."""
    stats = InferStats()
    config = RequestConfig(max_tokens=512, temperature=0, stream=True)
    stream = engine.infer([infer_request], config, metrics=[stats])

    query = infer_request.messages[0]['content']
    print(f'query: {query}\nresponse: ', end='')
    for chunk in stream[0]:
        # The streaming generator may yield None placeholders; skip them.
        if chunk is None:
            continue
        print(chunk.choices[0].delta.content, end='', flush=True)
    print()
    print(f'metric: {stats.compute()}')
+
38
+
def run_client(host: str = '127.0.0.1', port: int = 8000):
    """Connect to a deployed swift server and exercise both batch and streaming inference."""
    client = InferClient(host=host, port=port)
    print(f'models: {client.models}')

    # Here, `load_dataset` is used for convenience; `infer_batch` does not require creating a dataset.
    dataset = load_dataset(['AI-ModelScope/alpaca-gpt4-data-zh#1000'], seed=42)[0]
    print(f'dataset: {dataset}')
    requests = [InferRequest(**row) for row in dataset]
    infer_batch(client, requests)

    messages = [{'role': 'user', 'content': 'who are you?'}]
    infer_stream(client, InferRequest(messages=messages))
+
51
+
if __name__ == '__main__':
    # Imports are deferred to the entry point so the CUDA_VISIBLE_DEVICES pin
    # above takes effect before swift initializes any accelerator state.
    from swift.llm import (InferEngine, InferRequest, InferClient, RequestConfig, load_dataset, run_deploy,
                           DeployArguments)
    from swift.plugin import InferStats

    # NOTE: In a real deployment scenario, please comment out the context of run_deploy.
    deploy_args = DeployArguments(model='Qwen/Qwen2.5-1.5B-Instruct', verbose=False, log_interval=-1,
                                  infer_backend='vllm')
    with run_deploy(deploy_args) as port:
        run_client(port=port)