from transformers import AutoTokenizer
import torch
from modeling_videochat_flash import VideoChatFlashQwenForCausalLM

# model setup
model_path = './'
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = VideoChatFlashQwenForCausalLM.from_pretrained(model_path).to(torch.bfloat16).cuda()
image_processor = model.get_vision_tower().image_processor

mm_llm_compress = False  # whether to enable global token compression in the LLM
if mm_llm_compress:
    model.config.mm_llm_compress = True
    model.config.llm_compress_type = "uniform0_attention"
    model.config.llm_compress_layer_list = [4, 18]
    model.config.llm_image_token_ratio_list = [1, 0.75, 0.25]
else:
    model.config.mm_llm_compress = False

# evaluation settings
max_num_frames = 512
generation_config = dict(
    do_sample=False,   # greedy decoding; temperature/top_p below then typically have no effect
    temperature=0.0,
    max_new_tokens=1024,
    top_p=0.1,
    num_beams=1,
)

video_path = "test.mp4"

# single-turn conversation
question1 = "Describe this video in detail."
output1, chat_history = model.chat(
    video_path=video_path,
    tokenizer=tokenizer,
    user_prompt=question1,
    return_history=True,
    max_num_frames=max_num_frames,
    generation_config=generation_config,
)
print(output1)

# multi-turn conversation: pass chat_history back in to continue the dialogue
# question2 = "How many people appear in the video?"
# output2, chat_history = model.chat(
#     video_path=video_path,
#     tokenizer=tokenizer,
#     user_prompt=question2,
#     chat_history=chat_history,
#     return_history=True,
#     max_num_frames=max_num_frames,
#     generation_config=generation_config,
# )
# print(output2)
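
# --- optional extension (not in the original card): reuse the same single-turn
# call across several clips. A minimal sketch that only relies on the
# model.chat(...) signature shown above; the file names are hypothetical
# placeholders, and each clip is assumed to fit the same max_num_frames budget.
video_paths = ["clip_a.mp4", "clip_b.mp4"]  # hypothetical clips
for path in video_paths:
    answer, _ = model.chat(
        video_path=path,
        tokenizer=tokenizer,
        user_prompt=question1,
        return_history=True,   # returned history is discarded per clip
        max_num_frames=max_num_frames,
        generation_config=generation_config,
    )
    print(f"{path}: {answer}")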