File size: 7,062 Bytes
7feac49 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 |
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
def _infer_audio(model, use_chat_template: bool = True, max_model_len=8192, system=None, limit_mm_per_prompt=None):
    """Run a single audio-description request through vLLM and return the reply text.

    Args:
        model: Model id handed to ``VllmEngine``.
        use_chat_template: When ``False``, disable the chat template on the
            engine's default template (raw-completion mode).
        max_model_len: Context-length limit passed to the engine.
        system: Optional system prompt; when given it is prepended as a
            system message.
        limit_mm_per_prompt: Optional per-prompt multimodal limits. Defaults
            to ``{'audio': 2}``, the previously hard-coded value, so existing
            callers are unaffected (mirrors ``_infer_video``'s parameter).

    Returns:
        The generated message content (str) of the first choice.
    """
    # Keep backward compatibility with the old hard-coded limit.
    limit_mm_per_prompt = limit_mm_per_prompt or {'audio': 2}
    engine = VllmEngine(model, max_model_len=max_model_len, limit_mm_per_prompt=limit_mm_per_prompt)
    if not use_chat_template:
        engine.default_template.use_chat_template = False
    audios = ['http://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/weather.wav']
    messages = []
    if system is not None:
        messages.append({'role': 'system', 'content': system})
    messages.append({'role': 'user', 'content': 'describe the audio.'})
    # temperature=0 makes decoding greedy/deterministic so tests can compare exact strings.
    resp_list = engine.infer([InferRequest(messages=messages, audios=audios)],
                             RequestConfig(temperature=0, max_tokens=64, repetition_penalty=1.))
    return resp_list[0].choices[0].message.content
def _infer_image(model, use_chat_template: bool = True, max_model_len=8192, system=None, limit_mm_per_prompt=None):
    """Run a single image-description request through vLLM and return the reply text.

    Args:
        model: Model id handed to ``VllmEngine``.
        use_chat_template: When ``False``, disable the chat template on the
            engine's default template (raw-completion mode).
        max_model_len: Context-length limit passed to the engine.
        system: Optional system prompt; when given it is prepended as a
            system message.
        limit_mm_per_prompt: Optional per-prompt multimodal limits. Defaults
            to ``{'image': 5, 'video': 2}``, the previously hard-coded value,
            so existing callers are unaffected (mirrors ``_infer_video``).

    Returns:
        The generated message content (str) of the first choice.
    """
    # Keep backward compatibility with the old hard-coded limit.
    limit_mm_per_prompt = limit_mm_per_prompt or {'image': 5, 'video': 2}
    engine = VllmEngine(model, max_model_len=max_model_len, limit_mm_per_prompt=limit_mm_per_prompt)
    if not use_chat_template:
        engine.default_template.use_chat_template = False
    images = ['http://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/cat.png']
    messages = []
    if system is not None:
        messages.append({'role': 'system', 'content': system})
    messages.append({'role': 'user', 'content': 'describe the image.'})
    # temperature=0 makes decoding greedy/deterministic so tests can compare exact strings.
    resp_list = engine.infer([InferRequest(messages=messages, images=images)],
                             RequestConfig(temperature=0, max_tokens=64, repetition_penalty=1.))
    return resp_list[0].choices[0].message.content
def _infer_video(model, use_chat_template: bool = True, max_model_len=8192, system=None, limit_mm_per_prompt=None):
    """Ask *model* to describe a fixed sample video and return the reply text.

    ``limit_mm_per_prompt`` falls back to ``{'image': 16, 'video': 2}`` when
    not supplied; ``system`` (if given) is sent as a leading system message.
    Decoding is greedy (temperature 0) so the output is deterministic.
    """
    mm_limits = limit_mm_per_prompt or {'image': 16, 'video': 2}
    engine = VllmEngine(model, max_model_len=max_model_len, limit_mm_per_prompt=mm_limits)
    if not use_chat_template:
        engine.default_template.use_chat_template = False
    videos = ['https://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/baby.mp4']
    messages = [] if system is None else [{'role': 'system', 'content': system}]
    messages.append({'role': 'user', 'content': 'describe the video.'})
    request = InferRequest(messages=messages, videos=videos)
    config = RequestConfig(temperature=0, max_tokens=64, repetition_penalty=1.)
    responses = engine.infer([request], config)
    return responses[0].choices[0].message.content
def test_qwen2_audio():
    """Qwen2-Audio should transcribe/describe the sample wav deterministically."""
    expected = "The audio is a man speaking in Mandarin saying '今天天气真好呀'."
    assert _infer_audio('Qwen/Qwen2-Audio-7B-Instruct') == expected
def test_qwen2_vl():
    """Qwen2-VL should produce the pinned greedy description of the cat image."""
    expected = (
        'The image depicts a cute kitten with a fluffy, white and gray striped coat. The kitten has large, '
        'expressive blue eyes and is looking directly at the camera. Its ears are perked up, and it has a '
        'small red mark on its left ear. The background is blurred, focusing attention on the kitten. The overall')
    assert _infer_image('Qwen/Qwen2-VL-2B-Instruct') == expected
def test_qwen2_5_vl():
    """Qwen2.5-VL should produce the pinned greedy description of the cat image."""
    expected = (
        'The image depicts a cute, fluffy kitten with striking blue eyes and a white and gray fur pattern. '
        'The kitten has a small, pink nose and is looking directly at the camera with a curious expression. '
        "The background is blurred, drawing attention to the kitten's face. "
        'The overall appearance is very endearing and charming.')
    assert _infer_image('Qwen/Qwen2.5-VL-3B-Instruct') == expected
def test_deepseek_vl_v2():
    """deepseek-vl2-tiny (short context) should match the pinned description."""
    expected = ('The image depicts a close-up of a adorable kitten with large, expressive eyes. The kitten has '
                'a mix of white and gray fur with distinct black stripes, giving it a tabby-like appearance. '
                'Its ears are perked up, and its whiskers are prominently visible. The background is blurred, '
                'focusing attention on the kitten')
    assert _infer_image('deepseek-ai/deepseek-vl2-tiny', max_model_len=4096) == expected
def test_internvl2():
    """InternVL2-2B with an empty system prompt should match the pinned description."""
    expected = ('The image features a kitten with striking blue eyes and a mix of white and black fur. '
                'The kitten has large, expressive eyes and a small, pink nose. Its ears are perked up, '
                'and it appears to be looking directly at the camera. The fur is soft and fluffy, with a mix')
    assert _infer_image('OpenGVLab/InternVL2-2B', max_model_len=4096, system='') == expected
def test_minicpmv_2_5():
    """MiniCPM-Llama3-V-2.5 should match the pinned greedy description."""
    expected = (
        "The image is a digital painting of a kitten that captures the essence of a young feline's innocence "
        "and curiosity. The kitten's fur is rendered with a mix of gray, white, and black stripes, "
        'giving it a realistic and adorable appearance. Its large, expressive eyes are a striking blue, '
        "which draws the viewer's")
    assert _infer_image('OpenBMB/MiniCPM-Llama3-V-2_5', max_model_len=4096) == expected
def test_minicpmv_2_6():
    """MiniCPM-V-2.6 should match the pinned greedy description."""
    expected = (
        'The image features a close-up of a kitten with striking blue eyes and a mix of '
        "white and dark fur, possibly gray or black. The kitten's gaze is directed forward, giving it an "
        "expressive and captivating look. The background is blurred, drawing focus to the kitten's face. "
        "The overall composition emphasizes the kitten's features")
    assert _infer_image('OpenBMB/MiniCPM-V-2_6', max_model_len=4096) == expected
def test_minicpmo_2_6_video():
    """MiniCPM-o-2.6 video inference should match the pinned description."""
    expected = ('The video features a young child sitting on a bed, deeply engaged in reading a book. '
                'The child, dressed in a light blue sleeveless top and pink pants, is surrounded by a '
                'cozy and homely environment. The bed is adorned with a patterned blanket, and a white cloth '
                'is casually draped over the side.')
    assert _infer_video('OpenBMB/MiniCPM-o-2_6') == expected
def test_qwen2_5_vl_video():
    """Qwen2.5-VL video inference should match the pinned description."""
    expected = ('A baby wearing sunglasses is sitting on a bed and reading a book. '
                'The baby is holding the book with both hands and is looking at the pages. '
                'The baby is wearing a light blue shirt and pink pants. The baby is sitting '
                'on a white blanket. The baby is looking at the book and is smiling. The baby')
    assert _infer_video('Qwen/Qwen2.5-VL-3B-Instruct') == expected
def test_qwen2_5_omni():
    """Smoke test for Qwen2.5-Omni video inference: only requires a non-empty reply."""
    mm_limits = {'image': 1, 'video': 1, 'audio': 1}
    reply = _infer_video('Qwen/Qwen2.5-Omni-7B', limit_mm_per_prompt=mm_limits)
    assert reply
if __name__ == '__main__':
    # NOTE(review): the import is deliberately placed here, after the module-level
    # CUDA_VISIBLE_DEVICES assignment at the top of the file — presumably so the GPU
    # selection takes effect before swift/vLLM initialize; confirm before moving it
    # to the top of the file. Under pytest these names come into module scope only
    # when this file is run as a script.
    from swift.llm import VllmEngine, InferRequest, RequestConfig

    # Uncomment exactly one test to run it directly (each loads a full model,
    # so they are toggled manually rather than run in sequence).
    # test_qwen2_vl()
    # test_qwen2_5_vl()
    # test_deepseek_vl_v2()
    # test_internvl2()
    # test_qwen2_audio()
    # test_minicpmv_2_5()
    # test_minicpmv_2_6()
    # test_minicpmo_2_6_video()
    # test_qwen2_5_vl_video()
    test_qwen2_5_omni()
|