diff --git a/.gitattributes b/.gitattributes
index 12e94bade155bdc4b7c9850ec7b4c3ce49106f01..30e76d4df4681b8a314c710a62bd10ad6d5e9f8b 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -38,3 +38,6 @@ asset/banner.png filter=lfs diff=lfs merge=lfs -text
 docs/resources/grpo_clevr_count.png filter=lfs diff=lfs merge=lfs -text
 docs/resources/grpo_code.png filter=lfs diff=lfs merge=lfs -text
 docs/resources/dpo_data.png filter=lfs diff=lfs merge=lfs -text
+docs/resources/grpo_countdown_1.png filter=lfs diff=lfs merge=lfs -text
+docs/resources/grpo_countdown.png filter=lfs diff=lfs merge=lfs -text
+docs/resources/grpo_geoqa.png filter=lfs diff=lfs merge=lfs -text
diff --git a/docs/resources/grpo_countdown.png b/docs/resources/grpo_countdown.png
new file mode 100644
index 0000000000000000000000000000000000000000..af2ce0c0ce08cb3d8b152f6bafe2c15c056dcd72
--- /dev/null
+++ b/docs/resources/grpo_countdown.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1b55fe6864e0c92549940d6989d92b3ab22be38a035cff3694525252737fc91e
+size 2226402
diff --git a/docs/resources/grpo_countdown_1.png b/docs/resources/grpo_countdown_1.png
new file mode 100644
index 0000000000000000000000000000000000000000..819ab3d992619b077d75e6946d4637b030b8d213
--- /dev/null
+++ b/docs/resources/grpo_countdown_1.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b78dc3ce1cd541e76f2c557dea3aff06b278bb3b5413946a92c584cf42c1369f
+size 785044
diff --git a/docs/resources/grpo_geoqa.png b/docs/resources/grpo_geoqa.png
new file mode 100644
index 0000000000000000000000000000000000000000..071d9b8eacb301bd96e30c2eff1471e68a7632a8
--- /dev/null
+++ b/docs/resources/grpo_geoqa.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:71246376b16f2ff288542dca2ff31532b16ef99f5e862797463d548e447e1f8d
+size 2238084
diff --git a/examples/infer/demo_agent.py b/examples/infer/demo_agent.py
new file mode 100644
index 0000000000000000000000000000000000000000..c4867c11e59e01a89ee9920c80a28d86dc699efe
--- /dev/null
+++ b/examples/infer/demo_agent.py
@@ -0,0 +1,118 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
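+# Demonstrates a tool-calling round trip with an agent template: the model emits a
+# tool call for the weather query, a mocked tool result (the hard-coded JSON below,
+# no real API is called) is appended as a 'tool' message, and the model is queried
+# again for the final answer.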
+import os
+
+os.environ['CUDA_VISIBLE_DEVICES'] = '0'
+# os.environ['SWIFT_DEBUG'] = '1'
+
+
+def infer(engine: 'InferEngine', infer_request: 'InferRequest'):
+    stop = [engine.default_template.agent_template.keyword.observation]  # compat react_en
+    request_config = RequestConfig(max_tokens=512, temperature=0, stop=stop)
+    resp_list = engine.infer([infer_request], request_config)
+    query = infer_request.messages[0]['content']
+    response = resp_list[0].choices[0].message.content
+    print(f'query: {query}')
+    print(f'response: {response}')
+    print(f'tool_calls: {resp_list[0].choices[0].message.tool_calls}')
+
+    tool = '{"temperature": 32, "condition": "Sunny", "humidity": 50}'
+    print(f'tool_response: {tool}')
+    infer_request.messages += [{'role': 'assistant', 'content': response}, {'role': 'tool', 'content': tool}]
+    resp_list = engine.infer([infer_request], request_config)
+    response2 = resp_list[0].choices[0].message.content
+    print(f'response2: {response2}')
+
+
+def infer_stream(engine: 'InferEngine', infer_request: 'InferRequest'):
+    stop = [engine.default_template.agent_template.keyword.observation]
+    request_config = RequestConfig(max_tokens=512, temperature=0, stream=True, stop=stop)
+    gen_list = engine.infer([infer_request], request_config)
+    query = infer_request.messages[0]['content']
+    response = ''
+    print(f'query: {query}\nresponse: ', end='')
+    for resp in gen_list[0]:
+        if resp is None:
+            continue
+        delta = resp.choices[0].delta.content
+        response += delta
+        print(delta, end='', flush=True)
+    print()
+    print(f'tool_calls: {resp.choices[0].delta.tool_calls}')
+
+    tool = '{"temperature": 32, "condition": "Sunny", "humidity": 50}'
+    print(f'tool_response: {tool}\nresponse2: ', end='')
+    infer_request.messages += [{'role': 'assistant', 'content': response}, {'role': 'tool', 'content': tool}]
+    gen_list = engine.infer([infer_request], request_config)
+    for resp in gen_list[0]:
+        if resp is None:
+            continue
+        print(resp.choices[0].delta.content, end='', flush=True)
+    print()
+
+
+def get_infer_request():
+    return InferRequest(
+        messages=[{
+            'role': 'user',
+            'content': "How's the weather in Beijing today?"
+        }],
+        tools=[{
+            'name': 'get_current_weather',
+            'description': 'Get the current weather in a given location',
+            'parameters': {
+                'type': 'object',
+                'properties': {
+                    'location': {
+                        'type': 'string',
+                        'description': 'The city and state, e.g. San Francisco, CA'
+                    },
+                    'unit': {
+                        'type': 'string',
+                        'enum': ['celsius', 'fahrenheit']
+                    }
+                },
+                'required': ['location']
+            }
+        }])
+
+
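+# Note on the message layout below: the trailing {'role': 'assistant', 'content': None}
+# appears to tell the engine to resume from the partial assistant text
+# ('It is sunny today, ') rather than open a new turn, so the model output is the
+# continuation of that sentence. This is an assumption from reading the demo, not a
+# documented contract.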
+def infer_continue_generate(engine):
+    # Continue generating after the assistant message.
+    infer_request = InferRequest(messages=[{
+        'role': 'user',
+        'content': 'How is the weather today?'
+    }, {
+        'role': 'assistant',
+        'content': 'It is sunny today, '
+    }, {
+        'role': 'assistant',
+        'content': None
+    }])
+    request_config = RequestConfig(max_tokens=512, temperature=0)
+    resp_list = engine.infer([infer_request], request_config)
+    response = resp_list[0].choices[0].message.content
+    print(f'response: {response}')
+
+
+if __name__ == '__main__':
+    from swift.llm import InferEngine, InferRequest, PtEngine, RequestConfig
+    from swift.plugin import agent_templates
+    model = 'Qwen/Qwen2.5-1.5B-Instruct'
+    infer_backend = 'pt'
+
+    if infer_backend == 'pt':
+        engine = PtEngine(model, max_batch_size=64)
+    elif infer_backend == 'vllm':
+        from swift.llm import VllmEngine
+        engine = VllmEngine(model, max_model_len=8192)
+    elif infer_backend == 'lmdeploy':
+        from swift.llm import LmdeployEngine
+        engine = LmdeployEngine(model)
+
+    # agent_template = agent_templates['hermes']()  # react_en/qwen_en/qwen_en_parallel
+    # engine.default_template.agent_template = agent_template
+
+    infer(engine, get_infer_request())
+    infer_stream(engine, get_infer_request())
+
+    # infer_continue_generate(engine)
diff --git a/examples/infer/demo_bert.py b/examples/infer/demo_bert.py
new file mode 100644
index 0000000000000000000000000000000000000000..852f970c25604e0398a6e2baa41f2fcd1d2da768
--- /dev/null
+++ b/examples/infer/demo_bert.py
@@ -0,0 +1,53 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os
+from typing import List
+
+os.environ['CUDA_VISIBLE_DEVICES'] = '0'
+
+
+def infer_batch(engine: 'InferEngine', infer_requests: List['InferRequest']):
+    resp_list = engine.infer(infer_requests)
+    query0 = infer_requests[0].messages[0]['content']
+    query1 = infer_requests[1].messages[0]['content']
+    print(f'query0: {query0}')
+    print(f'response0: {resp_list[0].choices[0].message.content}')
+    print(f'query1: {query1}')
+    print(f'response1: {resp_list[1].choices[0].message.content}')
+
+
+if __name__ == '__main__':
+    # This is an example of BERT with LoRA.
+    from swift.llm import InferEngine, InferRequest, PtEngine, load_dataset, safe_snapshot_download, BaseArguments
+    from swift.tuners import Swift
+    adapter_path = safe_snapshot_download('swift/test_bert')
+    args = BaseArguments.from_pretrained(adapter_path)
+    args.max_length = 512
+    args.truncation_strategy = 'right'
+    # method1
+    model, processor = args.get_model_processor()
+    model = Swift.from_pretrained(model, adapter_path)
+    template = args.get_template(processor)
+    engine = PtEngine.from_model_template(model, template, max_batch_size=64)
+
+    # method2
+    # engine = PtEngine(args.model, adapters=[adapter_path], max_batch_size=64,
+    #                   task_type=args.task_type, num_labels=args.num_labels)
+    # template = args.get_template(engine.processor)
+    # engine.default_template = template
+
+    # Here, `load_dataset` is used for convenience; `infer_batch` does not require creating a dataset.
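+    # Dataset spec below (ms-swift dataset syntax): 'DAMO_NLP/jd' with subset 'cls';
+    # the '#1000' suffix samples 1000 rows, and seed=42 makes the sampling reproducible.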
+    dataset = load_dataset(['DAMO_NLP/jd:cls#1000'], seed=42)[0]
+    print(f'dataset: {dataset}')
+    infer_requests = [InferRequest(messages=data['messages']) for data in dataset]
+    infer_batch(engine, infer_requests)
+
+    infer_batch(engine, [
+        InferRequest(messages=[{
+            'role': 'user',
+            'content': '今天天气真好呀'  # 'The weather is really nice today'
+        }]),
+        InferRequest(messages=[{
+            'role': 'user',
+            'content': '真倒霉'  # 'How unlucky'
+        }])
+    ])
diff --git a/examples/infer/demo_grounding.py b/examples/infer/demo_grounding.py
new file mode 100644
index 0000000000000000000000000000000000000000..6f20fd8294a3d7515e9f3e349f775f8b044a5d04
--- /dev/null
+++ b/examples/infer/demo_grounding.py
@@ -0,0 +1,43 @@
+# pip install git+https://github.com/huggingface/transformers.git  # transformers>=4.49
+import os
+import re
+from typing import Literal
+
+os.environ['CUDA_VISIBLE_DEVICES'] = '0'
+
+
+def draw_bbox_qwen2_vl(image, response, norm_bbox: Literal['norm1000', 'none']):
+    matches = re.findall(
+        r'<\|object_ref_start\|>(.*?)<\|object_ref_end\|><\|box_start\|>\((\d+),(\d+)\),\((\d+),(\d+)\)<\|box_end\|>',
+        response)
+    ref = []
+    bbox = []
+    for match_ in matches:
+        ref.append(match_[0])
+        bbox.append(list(match_[1:]))
+    draw_bbox(image, ref, bbox, norm_bbox=norm_bbox)
+
+
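+# For reference, a Qwen2-VL grounding response is expected to look like
+#   <|object_ref_start|>sheep<|object_ref_end|><|box_start|>(176,106),(478,512)<|box_end|>
+# (illustrative values, not a real model output). With norm_bbox='norm1000' the
+# coordinates are on a 0-1000 scale relative to the image size, and draw_bbox is
+# assumed to rescale them to pixels before drawing.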
+def infer_grounding():
+    from swift.llm import PtEngine, RequestConfig, BaseArguments, InferRequest, safe_snapshot_download
+    output_path = 'bbox.png'
+    image = load_image('http://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/animal.png')
+    infer_request = InferRequest(messages=[{'role': 'user', 'content': 'Task: Object Detection'}], images=[image])
+
+    request_config = RequestConfig(max_tokens=512, temperature=0)
+    adapter_path = safe_snapshot_download('swift/test_grounding')
+    args = BaseArguments.from_pretrained(adapter_path)
+
+    engine = PtEngine(args.model, adapters=[adapter_path])
+    resp_list = engine.infer([infer_request], request_config)
+    response = resp_list[0].choices[0].message.content
+    print(f'lora-response: {response}')
+
+    draw_bbox_qwen2_vl(image, response, norm_bbox=args.norm_bbox)
+    print(f'output_path: {output_path}')
+    image.save(output_path)
+
+
+if __name__ == '__main__':
+    from swift.llm import draw_bbox, load_image
+    infer_grounding()
diff --git a/examples/infer/demo_hf.py b/examples/infer/demo_hf.py
new file mode 100644
index 0000000000000000000000000000000000000000..c2f2114a279b41affd6137bb2607f1740670f566
--- /dev/null
+++ b/examples/infer/demo_hf.py
@@ -0,0 +1,61 @@
+def infer_hf():
+    from transformers import AutoModelForCausalLM, AutoTokenizer
+    from peft import PeftModel
+    from modelscope import snapshot_download
+    model_dir = snapshot_download('Qwen/Qwen2.5-7B-Instruct')
+    adapter_dir = snapshot_download('swift/test_lora')
+    model = AutoModelForCausalLM.from_pretrained(
+        model_dir, torch_dtype='auto', device_map='auto', trust_remote_code=True)
+    model = PeftModel.from_pretrained(model, adapter_dir)
+
+    tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)
+
+    messages = [{
+        'role': 'system',
+        'content': 'You are a helpful assistant.'
+    }, {
+        'role': 'user',
+        'content': 'who are you?'
+    }]
+    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+    model_inputs = tokenizer([text], return_tensors='pt', add_special_tokens=False).to(model.device)
+
+    generated_ids = model.generate(**model_inputs, max_new_tokens=512, do_sample=False)
+    generated_ids = [
+        output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
+    ]
+
+    response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
+    print(f'response: {response}')
+    return response
+
+
+def infer_swift():
+    from swift.llm import get_model_tokenizer, get_template, InferRequest, RequestConfig, PtEngine
+    from modelscope import snapshot_download
+    from swift.tuners import Swift
+    model_dir = snapshot_download('Qwen/Qwen2.5-7B-Instruct')
+    adapter_dir = snapshot_download('swift/test_lora')
+    model, tokenizer = get_model_tokenizer(model_dir, device_map='auto')
+    model = Swift.from_pretrained(model, adapter_dir)
+    template = get_template(model.model_meta.template, tokenizer)
+    engine = PtEngine.from_model_template(model, template)
+
+    messages = [{
+        'role': 'system',
+        'content': 'You are a helpful assistant.'
+    }, {
+        'role': 'user',
+        'content': 'who are you?'
+    }]
+    request_config = RequestConfig(max_tokens=512, temperature=0)
+    resp_list = engine.infer([InferRequest(messages=messages)], request_config=request_config)
+    response = resp_list[0].choices[0].message.content
+    print(f'response: {response}')
+    return response
+
+
+if __name__ == '__main__':
+    response = infer_hf()
+    response2 = infer_swift()
+    assert response == response2
diff --git a/examples/infer/demo_lora.py b/examples/infer/demo_lora.py
new file mode 100644
index 0000000000000000000000000000000000000000..8d9396f135cca27c89b09636b5c5ffcc749a2335
--- /dev/null
+++ b/examples/infer/demo_lora.py
@@ -0,0 +1,68 @@
+import os
+from typing import Literal
+
+os.environ['CUDA_VISIBLE_DEVICES'] = '0'
+
+
+def infer_multilora(infer_request: 'InferRequest', infer_backend: Literal['vllm', 'pt']):
+    # Dynamic LoRA
+    adapter_path = safe_snapshot_download('swift/test_lora')
+    adapter_path2 = safe_snapshot_download('swift/test_lora2')
+    args = BaseArguments.from_pretrained(adapter_path)
+    if infer_backend == 'pt':
+        engine = PtEngine(args.model)
+    elif infer_backend == 'vllm':
+        from swift.llm import VllmEngine
+        engine = VllmEngine(args.model, enable_lora=True, max_loras=1, max_lora_rank=16)
+    template = get_template(args.template, engine.processor, args.system)
+    request_config = RequestConfig(max_tokens=512, temperature=0)
+    adapter_request = AdapterRequest('lora1', adapter_path)
+    adapter_request2 = AdapterRequest('lora2', adapter_path2)
+
+    # use lora
+    resp_list = engine.infer([infer_request], request_config, template=template, adapter_request=adapter_request)
+    response = resp_list[0].choices[0].message.content
+    print(f'lora1-response: {response}')
+    # original model (no adapter)
+    resp_list = engine.infer([infer_request], request_config)
+    response = resp_list[0].choices[0].message.content
+    print(f'response: {response}')
+    # use lora
+    resp_list = engine.infer([infer_request], request_config, template=template, adapter_request=adapter_request2)
+    response = resp_list[0].choices[0].message.content
+    print(f'lora2-response: {response}')
+
+
+def infer_lora(infer_request: 'InferRequest'):
+    request_config = RequestConfig(max_tokens=512, temperature=0)
+    adapter_path = safe_snapshot_download('swift/test_lora')
+    args = BaseArguments.from_pretrained(adapter_path)
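+    # Three equivalent ways to build an engine with this adapter; method3 is the
+    # one exercised here, while method1/method2 are kept as commented-out alternatives.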
+    # method1
+    # engine = PtEngine(args.model, adapters=[adapter_path])
+    # template = get_template(args.template, engine.tokenizer, args.system)
+    # engine.default_template = template
+
+    # method2
+    # model, processor = args.get_model_processor()
+    # model = Swift.from_pretrained(model, adapter_path)
+    # template = args.get_template(processor)
+    # engine = PtEngine.from_model_template(model, template)
+
+    # method3
+    model, tokenizer = get_model_tokenizer(args.model)
+    model = Swift.from_pretrained(model, adapter_path)
+    template = get_template(args.template, tokenizer, args.system)
+    engine = PtEngine.from_model_template(model, template)
+
+    resp_list = engine.infer([infer_request], request_config)
+    response = resp_list[0].choices[0].message.content
+    print(f'lora-response: {response}')
+
+
+if __name__ == '__main__':
+    from swift.llm import (PtEngine, RequestConfig, AdapterRequest, get_template, BaseArguments, InferRequest,
+                           safe_snapshot_download, get_model_tokenizer)
+    from swift.tuners import Swift
+    infer_request = InferRequest(messages=[{'role': 'user', 'content': 'who are you?'}])
+    # infer_lora(infer_request)
+    infer_multilora(infer_request, 'pt')
diff --git a/examples/infer/demo_mllm.py b/examples/infer/demo_mllm.py
new file mode 100644
index 0000000000000000000000000000000000000000..5fca560e44853c715a3f64c42fbb3ce87aceb225
--- /dev/null
+++ b/examples/infer/demo_mllm.py
@@ -0,0 +1,145 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os
+from typing import List, Literal
+
+os.environ['CUDA_VISIBLE_DEVICES'] = '0'
+
+
+def infer_batch(engine: 'InferEngine', infer_requests: List['InferRequest']):
+    request_config = RequestConfig(max_tokens=512, temperature=0)
+    metric = InferStats()
+    resp_list = engine.infer(infer_requests, request_config, metrics=[metric])
+    query0 = infer_requests[0].messages[0]['content']
+    print(f'query0: {query0}')
+    print(f'response0: {resp_list[0].choices[0].message.content}')
+    print(f'metric: {metric.compute()}')
+    # metric.reset()  # reuse
+
+
+def infer_stream(engine: 'InferEngine', infer_request: 'InferRequest'):
+    request_config = RequestConfig(max_tokens=512, temperature=0, stream=True)
+    metric = InferStats()
+    gen_list = engine.infer([infer_request], request_config, metrics=[metric])
+    query = infer_request.messages[0]['content']
+    print(f'query: {query}\nresponse: ', end='')
+    for resp in gen_list[0]:
+        if resp is None:
+            continue
+        print(resp.choices[0].delta.content, end='', flush=True)
+    print()
+    print(f'metric: {metric.compute()}')
+
+
+def get_message(mm_type: Literal['text', 'image', 'video', 'audio']):
+    if mm_type == 'text':
+        message = {'role': 'user', 'content': 'who are you?'}
+    elif mm_type == 'image':
+        message = {
+            'role':
+            'user',
+            'content': [
+                {
+                    'type': 'image',
+                    # url or local_path or PIL.Image or base64
+                    'image': 'http://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/animal.png'
+                },
+                {
+                    'type': 'text',
+                    'text': 'How many sheep are there in the picture?'
+                }
+            ]
+        }
+
+    elif mm_type == 'video':
+        message = {
+            'role':
+            'user',
+            'content': [{
+                'type': 'video',
+                'video': 'https://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/baby.mp4'
+            }, {
+                'type': 'text',
+                'text': 'Describe this video.'
+            }]
+        }
+    elif mm_type == 'audio':
+        message = {
+            'role':
+            'user',
+            'content': [{
+                'type': 'audio',
+                'audio': 'http://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/weather.wav'
+            }, {
+                'type': 'text',
+                'text': 'What does this audio say?'
+            }]
+        }
+    return message
+
+
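+# get_message and get_data express the same request in two ways: get_message uses
+# the structured content list above, while get_data uses a plain-text prompt with
+# placeholder tags (e.g. <image>) plus parallel 'images'/'videos' fields.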
+def get_data(mm_type: Literal['text', 'image', 'video', 'audio']):
+    data = {}
+    if mm_type == 'text':
+        messages = [{'role': 'user', 'content': 'who are you?'}]
+    elif mm_type == 'image':
+        # The number of <image> tags must be the same as len(images).
+        messages = [{'role': 'user', 'content': '<image>How many sheep are there in the picture?'}]
+        # Support URL/Path/base64/PIL.Image
+        data['images'] = ['http://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/animal.png']
+    elif mm_type == 'video':
+        messages = [{'role': 'user', 'content': '