"""Multi-round video question answering with OpenGVLab/VideoChat-R1_5.

Each "perception" round asks the model to answer a question about a video and
to emit the time segments ("glue") its answer is based on; those segments are
fed back as ``key_time`` hints into the next round's vision preprocessing.
"""
from transformers import Qwen3VLForConditionalGeneration, AutoProcessor
from my_vision_process import process_vision_info
import torch
import re
import ast

# Note: The model class has been updated to Qwen3VLForConditionalGeneration
# for consistency with the main application and the latest transformers
# library conventions for this model.
model_path = "OpenGVLab/VideoChat-R1_5"

# default: Load the model on the available device(s)
model = Qwen3VLForConditionalGeneration.from_pretrained(
    model_path,
    torch_dtype="auto",
    device_map="auto",
    attn_implementation="flash_attention_2",
).eval()

# default processor
processor = AutoProcessor.from_pretrained(model_path)

video_path = "your_video.mp4"
# Placeholder for the question text. (The previous placeholder "your_qa.mp4"
# was a copy-paste of the video placeholder — a question is not an .mp4 file.)
question = "your_question"
# Number of perception rounds; the last round uses the no-glue prompt.
# (Name kept as-is — sic "percptions" — so existing callers don't break.)
num_percptions = 3

# NOTE(review): the angle-bracket tags in these prompt templates had been
# stripped (most likely by an HTML-unaware tool), leaving instructions such as
# "within the tags" that name no tag. They are restored below to the
# <think>/<answer>/<glue> format used by the VideoChat-R1 family — confirm
# against the official model card's prompt templates.
QA_THINK_GLUE = """Answer the question: "[QUESTION]" according to the content of the video. Output your think process within the <think> </think> tags. Then, provide your answer within the <answer> </answer> tags, output the corresponding letter of the option. At the same time, in the <glue> </glue> tags, present the precise time period in seconds of the video clips on which you base your answer to this question in the format of [(s1, e1), (s2, e2), ...]. For example: <think>...</think><answer>A</answer><glue>[(5.2, 10.4)]</glue>.
"""

# NOTE(review): this template does not ask for a <glue> section, yet the
# surviving example text contained "[(5.2, 10.4)]" — the glue span in the
# example below is preserved from the original but looks vestigial; verify.
QA_THINK = """Answer the question: "[QUESTION]" according to the content of the video. Output your think process within the <think> </think> tags. Then, provide your answer within the <answer> </answer> tags, output the corresponding letter of the option. For example: <think>...</think><answer>A</answer><glue>[(5.2, 10.4)]</glue>.
"""


def inference(video_path, prompt, model, processor, max_new_tokens=2048, client=None, pred_glue=None):
    """Run one round of video QA and return the decoded model reply.

    Args:
        video_path: Path (or URI) of the video handed to the vision pipeline.
        prompt: Fully formatted question prompt (one of the templates above).
        model: Loaded Qwen3VLForConditionalGeneration instance.
        processor: Matching AutoProcessor.
        max_new_tokens: Generation budget for the reply.
        client: Optional storage/client object forwarded to process_vision_info.
        pred_glue: Time segments [(s1, e1), ...] from the previous round, or
            None on the first round; passed as the 'key_time' hint.

    Returns:
        The generated text for the single prompt in the batch.
    """
    device = model.device
    messages = [
        {"role": "user", "content": [
            {
                "type": "video",
                "video": video_path,
                # Feed the previous round's predicted segments back in so the
                # preprocessor can sample frames around the key moments.
                "key_time": pred_glue,
                "total_pixels": 128 * 12 * 28 * 28,
                "min_pixels": 128 * 28 * 28,
            },
            {"type": "text", "text": prompt},
        ]},
    ]
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    image_inputs, video_inputs, video_kwargs = process_vision_info(
        messages, return_video_kwargs=True, client=client
    )
    # process_vision_info returns one fps per video; we only send one video.
    fps_inputs = video_kwargs['fps'][0]
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        fps=fps_inputs,
        padding=True,
        return_tensors="pt",
    )
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        output_ids = model.generate(**inputs, max_new_tokens=max_new_tokens, use_cache=True)
    # Strip the echoed prompt tokens so only the newly generated tail is decoded.
    generated_ids = [output_ids[i][len(inputs['input_ids'][i]):] for i in range(len(output_ids))]
    output_text = processor.batch_decode(
        generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True
    )
    return output_text[0]


# This is example usage code. You should replace the placeholders.
# For example:
# item = {"problem": {"question": "What is the person doing in the video?"}}
# client = None  # Or initialize your client
# pred_glue = None
# answers = []
# for percption in range(num_percptions):
#     if percption == num_percptions - 1:
#         # Final round: answer only, no glue requested.
#         example_prompt = QA_THINK.replace("[QUESTION]", item["problem"]["question"])
#     else:
#         example_prompt = QA_THINK_GLUE.replace("[QUESTION]", item["problem"]["question"])
#     ans = inference(video_path, example_prompt, model, processor, client=client, pred_glue=pred_glue)
#     # The original pattern r'(.*?)' matched the empty string; the <glue> tags
#     # had been stripped along with the prompt tags.
#     pattern_glue = r'<glue>(.*?)</glue>'
#     match_glue = re.search(pattern_glue, ans, re.DOTALL)
#     answers.append(ans)
#     pred_glue = None
#     try:
#         if match_glue:
#             glue = match_glue.group(1)
#             # literal_eval safely parses the [(s1, e1), ...] list of tuples.
#             pred_glue = ast.literal_eval(glue)
#     except Exception as e:
#         pred_glue = None
#     print(ans)