| | import os |
| |
|
# Expose only CUDA device 0 to this process. This assignment executes at module
# import time, i.e. ahead of the `swift.llm` import performed further down in
# the file.
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
| |
|
| |
|
def _infer_audio(model, use_chat_template: bool = True, max_model_len=8192, system=None):
    """Send one audio-description request through a vLLM engine and return the reply text.

    Args:
        model: Model identifier passed straight to ``VllmEngine``.
        use_chat_template: When False, ``use_chat_template`` is switched off on
            the engine's default template before inferring.
        max_model_len: Forwarded to ``VllmEngine`` as ``max_model_len``.
        system: Optional system message content. It is added whenever it is not
            None, so an empty string still produces a system message.

    Returns:
        The ``content`` of the first choice of the first response.
    """
    # Imported here rather than relying on the names bound in the
    # ``if __name__ == '__main__'`` block: that block does not run when this
    # module is imported (e.g. collected by a test runner), which would leave
    # these names undefined. The import still happens at call time, after the
    # module-level CUDA_VISIBLE_DEVICES assignment.
    from swift.llm import InferRequest, RequestConfig, VllmEngine

    engine = VllmEngine(model, max_model_len=max_model_len, limit_mm_per_prompt={'audio': 2})
    if not use_chat_template:
        engine.default_template.use_chat_template = False

    audios = ['http://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/weather.wav']
    messages = []
    if system is not None:
        messages.append({'role': 'system', 'content': system})
    messages.append({'role': 'user', 'content': 'describe the audio.'})

    # temperature=0 with a 64-token cap; callers compare the reply against a fixed string.
    request_config = RequestConfig(temperature=0, max_tokens=64, repetition_penalty=1.)
    resp_list = engine.infer([InferRequest(messages=messages, audios=audios)], request_config)
    return resp_list[0].choices[0].message.content
| |
|
| |
|
def _infer_image(model, use_chat_template: bool = True, max_model_len=8192, system=None):
    """Send one image-description request through a vLLM engine and return the reply text.

    Args:
        model: Model identifier passed straight to ``VllmEngine``.
        use_chat_template: When False, ``use_chat_template`` is switched off on
            the engine's default template before inferring.
        max_model_len: Forwarded to ``VllmEngine`` as ``max_model_len``.
        system: Optional system message content. It is added whenever it is not
            None, so an empty string still produces a system message.

    Returns:
        The ``content`` of the first choice of the first response.
    """
    # Imported here rather than relying on the names bound in the
    # ``if __name__ == '__main__'`` block: that block does not run when this
    # module is imported (e.g. collected by a test runner), which would leave
    # these names undefined. The import still happens at call time, after the
    # module-level CUDA_VISIBLE_DEVICES assignment.
    from swift.llm import InferRequest, RequestConfig, VllmEngine

    engine = VllmEngine(model, max_model_len=max_model_len, limit_mm_per_prompt={'image': 5, 'video': 2})
    if not use_chat_template:
        engine.default_template.use_chat_template = False

    images = ['http://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/cat.png']
    messages = []
    if system is not None:
        messages.append({'role': 'system', 'content': system})
    messages.append({'role': 'user', 'content': 'describe the image.'})

    # temperature=0 with a 64-token cap; callers compare the reply against a fixed string.
    request_config = RequestConfig(temperature=0, max_tokens=64, repetition_penalty=1.)
    resp_list = engine.infer([InferRequest(messages=messages, images=images)], request_config)
    return resp_list[0].choices[0].message.content
| |
|
| |
|
def _infer_video(model, use_chat_template: bool = True, max_model_len=8192, system=None, limit_mm_per_prompt=None):
    """Send one video-description request through a vLLM engine and return the reply text.

    Args:
        model: Model identifier passed straight to ``VllmEngine``.
        use_chat_template: When False, ``use_chat_template`` is switched off on
            the engine's default template before inferring.
        max_model_len: Forwarded to ``VllmEngine`` as ``max_model_len``.
        system: Optional system message content. It is added whenever it is not
            None, so an empty string still produces a system message.
        limit_mm_per_prompt: Per-modality limits forwarded to ``VllmEngine``.
            Any falsy value (None or an empty dict) falls back to
            ``{'image': 16, 'video': 2}``.

    Returns:
        The ``content`` of the first choice of the first response.
    """
    # Imported here rather than relying on the names bound in the
    # ``if __name__ == '__main__'`` block: that block does not run when this
    # module is imported (e.g. collected by a test runner), which would leave
    # these names undefined. The import still happens at call time, after the
    # module-level CUDA_VISIBLE_DEVICES assignment.
    from swift.llm import InferRequest, RequestConfig, VllmEngine

    limit_mm_per_prompt = limit_mm_per_prompt or {'image': 16, 'video': 2}
    engine = VllmEngine(model, max_model_len=max_model_len, limit_mm_per_prompt=limit_mm_per_prompt)
    if not use_chat_template:
        engine.default_template.use_chat_template = False

    videos = ['https://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/baby.mp4']
    messages = []
    if system is not None:
        messages.append({'role': 'system', 'content': system})
    messages.append({'role': 'user', 'content': 'describe the video.'})

    # temperature=0 with a 64-token cap; callers compare the reply against a fixed string.
    request_config = RequestConfig(temperature=0, max_tokens=64, repetition_penalty=1.)
    resp_list = engine.infer([InferRequest(messages=messages, videos=videos)], request_config)
    return resp_list[0].choices[0].message.content
| |
|
| |
|
def test_qwen2_audio():
    """Compare the Qwen2-Audio-7B-Instruct reply for the sample clip with the expected text."""
    expected = "The audio is a man speaking in Mandarin saying '今天天气真好呀'."
    answer = _infer_audio('Qwen/Qwen2-Audio-7B-Instruct')
    assert answer == expected
| |
|
| |
|
def test_qwen2_vl():
    """Compare the Qwen2-VL-2B-Instruct reply for the sample image with the expected text."""
    expected = (
        'The image depicts a cute kitten with a fluffy, white and gray striped coat. The kitten has large, '
        'expressive blue eyes and is looking directly at the camera. Its ears are perked up, and it has a '
        'small red mark on its left ear. The background is blurred, focusing attention on the kitten. The overall')
    answer = _infer_image('Qwen/Qwen2-VL-2B-Instruct')
    assert answer == expected
| |
|
| |
|
def test_qwen2_5_vl():
    """Compare the Qwen2.5-VL-3B-Instruct reply for the sample image with the expected text."""
    expected = (
        'The image depicts a cute, fluffy kitten with striking blue eyes and a white and gray fur pattern. '
        'The kitten has a small, pink nose and is looking directly at the camera with a curious expression. '
        "The background is blurred, drawing attention to the kitten's face. "
        'The overall appearance is very endearing and charming.')
    answer = _infer_image('Qwen/Qwen2.5-VL-3B-Instruct')
    assert answer == expected
| |
|
| |
|
def test_deepseek_vl_v2():
    """Compare the deepseek-vl2-tiny reply (max_model_len=4096) for the sample image with the expected text."""
    expected = (
        'The image depicts a close-up of a adorable kitten with large, expressive eyes. The kitten has '
        'a mix of white and gray fur with distinct black stripes, giving it a tabby-like appearance. '
        'Its ears are perked up, and its whiskers are prominently visible. The background is blurred, '
        'focusing attention on the kitten')
    answer = _infer_image('deepseek-ai/deepseek-vl2-tiny', max_model_len=4096)
    assert answer == expected
| |
|
| |
|
def test_internvl2():
    """Compare the InternVL2-2B reply for the sample image with the expected text.

    The request is made with ``max_model_len=4096`` and an empty system message.
    """
    expected = (
        'The image features a kitten with striking blue eyes and a mix of white and black fur. '
        'The kitten has large, expressive eyes and a small, pink nose. Its ears are perked up, '
        'and it appears to be looking directly at the camera. The fur is soft and fluffy, with a mix')
    answer = _infer_image('OpenGVLab/InternVL2-2B', max_model_len=4096, system='')
    assert answer == expected
| |
|
| |
|
def test_minicpmv_2_5():
    """Compare the MiniCPM-Llama3-V-2_5 reply (max_model_len=4096) for the sample image with the expected text."""
    expected = (
        "The image is a digital painting of a kitten that captures the essence of a young feline's innocence "
        "and curiosity. The kitten's fur is rendered with a mix of gray, white, and black stripes, "
        'giving it a realistic and adorable appearance. Its large, expressive eyes are a striking blue, '
        "which draws the viewer's")
    answer = _infer_image('OpenBMB/MiniCPM-Llama3-V-2_5', max_model_len=4096)
    assert answer == expected
| |
|
| |
|
def test_minicpmv_2_6():
    """Compare the MiniCPM-V-2_6 reply (max_model_len=4096) for the sample image with the expected text."""
    expected = (
        'The image features a close-up of a kitten with striking blue eyes and a mix of '
        "white and dark fur, possibly gray or black. The kitten's gaze is directed forward, giving it an "
        "expressive and captivating look. The background is blurred, drawing focus to the kitten's face. "
        "The overall composition emphasizes the kitten's features")
    answer = _infer_image('OpenBMB/MiniCPM-V-2_6', max_model_len=4096)
    assert answer == expected
| |
|
| |
|
def test_minicpmo_2_6_video():
    """Compare the MiniCPM-o-2_6 reply for the sample video with the expected text."""
    expected = (
        'The video features a young child sitting on a bed, deeply engaged in reading a book. '
        'The child, dressed in a light blue sleeveless top and pink pants, is surrounded by a '
        'cozy and homely environment. The bed is adorned with a patterned blanket, and a white cloth '
        'is casually draped over the side.')
    answer = _infer_video('OpenBMB/MiniCPM-o-2_6')
    assert answer == expected
| |
|
| |
|
def test_qwen2_5_vl_video():
    """Compare the Qwen2.5-VL-3B-Instruct reply for the sample video with the expected text."""
    expected = (
        'A baby wearing sunglasses is sitting on a bed and reading a book. '
        'The baby is holding the book with both hands and is looking at the pages. '
        'The baby is wearing a light blue shirt and pink pants. The baby is sitting '
        'on a white blanket. The baby is looking at the book and is smiling. The baby')
    answer = _infer_video('Qwen/Qwen2.5-VL-3B-Instruct')
    assert answer == expected
| |
|
| |
|
def test_qwen2_5_omni():
    """Run Qwen2.5-Omni-7B on the sample video and require a non-empty reply.

    The engine is limited to one image, one video and one audio per prompt.
    """
    reply = _infer_video(
        'Qwen/Qwen2.5-Omni-7B',
        limit_mm_per_prompt={'image': 1, 'video': 1, 'audio': 1},
    )
    assert reply
| |
|
| |
|
if __name__ == '__main__':
    # Script entry point: bind the swift.llm inference classes as module-level
    # names, then run the single selected test.
    from swift.llm import (
        VllmEngine,
        InferRequest,
        RequestConfig,
    )

    test_qwen2_5_omni()
| |
|