from transformers import AutoTokenizer
import torch
from modeling_videochat_flash import VideoChatFlashQwenForCausalLM
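# NOTE: the direct import above assumes modeling_videochat_flash.py sits next to
# this script (e.g. inside the downloaded checkpoint directory, matching
# model_path = './' below).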
# model setting
model_path = './'
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = VideoChatFlashQwenForCausalLM.from_pretrained(model_path).to(torch.bfloat16).cuda()
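# bfloat16 requires an Ampere-or-newer GPU; on older cards, torch.float16 is a
# reasonable substitute.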
image_processor = model.get_vision_tower().image_processor
mm_llm_compress = False  # whether to enable global token compression inside the LLM
if mm_llm_compress:
    model.config.mm_llm_compress = True
    model.config.llm_compress_type = "uniform0_attention"
    model.config.llm_compress_layer_list = [4, 18]  # LLM layers at which visual tokens are compressed
    model.config.llm_image_token_ratio_list = [1, 0.75, 0.25]  # fraction of visual tokens kept at each stage
else:
    model.config.mm_llm_compress = False
# evaluation setting
max_num_frames = 512  # upper bound on the number of frames sampled from the video
generation_config = dict(
    do_sample=False,  # greedy decoding: temperature and top_p have no effect
    temperature=0.0,
    max_new_tokens=1024,
    top_p=0.1,
    num_beams=1
)
video_path = "test.mp4"
# single-turn conversation
question1 = "Describe this video in detail."
output1, chat_history = model.chat(video_path=video_path, tokenizer=tokenizer, user_prompt=question1, return_history=True, max_num_frames=max_num_frames, generation_config=generation_config)
print(output1)
# # multi-turn conversation
# question2 = "How many people appear in the video?"
# output2, chat_history = model.chat(video_path=video_path, tokenizer=tokenizer, user_prompt=question2, chat_history=chat_history, return_history=True, max_num_frames=max_num_frames, generation_config=generation_config)
# print(output2)