from transformers import Qwen3VLForConditionalGeneration, AutoProcessor
from my_vision_process import process_vision_info
import torch
import re
import ast
# Note: The model class has been updated to Qwen3VLForConditionalGeneration for consistency
# with the main application and the latest transformers library conventions for this model.
# Hugging Face hub id of the VideoChat-R1.5 checkpoint to load.
model_path = "OpenGVLab/VideoChat-R1_5"
# default: Load the model on the available device(s)
# NOTE(review): device_map="auto" shards the weights across all visible devices,
# and attn_implementation="flash_attention_2" requires the flash-attn package on
# a supported GPU — confirm the runtime environment provides both.
model = Qwen3VLForConditionalGeneration.from_pretrained(
model_path, torch_dtype="auto", device_map="auto",
attn_implementation="flash_attention_2"
).eval()
# Default processor (tokenizer + image/video preprocessor) for the same checkpoint.
processor = AutoProcessor.from_pretrained(model_path)
# Placeholder inputs — replace with a real video file and question before running.
video_path = "your_video.mp4"
question = "your question"
# Number of perception (reasoning) rounds: the first rounds use the glue prompt
# to localize supporting clips, the final round uses the plain QA prompt.
num_percptions = 3
# Prompt used for the glue rounds. The <think>/<answer>/<glue> tags had been
# stripped from this file (the text read "within the tags"); they are restored
# here to match the extraction regex (r'<glue>(.*?)</glue>') used downstream.
QA_THINK_GLUE = """Answer the question: "[QUESTION]" according to the content of the video.
Output your think process within the <think> </think> tags.
Then, provide your answer within the <answer> </answer> tags, output the corresponding letter of the option. At the same time, in the <glue> </glue> tags, present the precise time period in seconds of the video clips on which you base your answer to this question in the format of [(s1, e1), (s2, e2), ...]. For example: <think>...</think><answer>A</answer><glue>[(5.2, 10.4)]</glue>.
"""
# Prompt used for the final round: same contract but without the <glue> grounding.
QA_THINK = """Answer the question: "[QUESTION]" according to the content of the video.
Output your think process within the <think> </think> tags.
Then, provide your answer within the <answer> </answer> tags, output the corresponding letter of the option. For example: <think>...</think><answer>A</answer>.
"""
def inference(video_path, prompt, model, processor, max_new_tokens=2048, client=None, pred_glue=None):
    """Run one round of video QA and return the decoded model answer.

    Builds a single-turn chat message containing the video (optionally with
    key-time hints from a previous round via ``pred_glue``) and the text
    prompt, preprocesses it, generates, and decodes only the newly produced
    tokens.

    Args:
        video_path: Path or URL of the input video.
        prompt: Fully formatted question prompt.
        model: Loaded vision-language model (provides ``.device`` and ``.generate``).
        processor: Matching processor (chat template + tokenization + decoding).
        max_new_tokens: Generation budget for the answer.
        client: Optional storage client forwarded to ``process_vision_info``.
        pred_glue: Optional ``[(start, end), ...]`` second ranges from a prior
            round, passed through as the video's ``key_time`` hint.

    Returns:
        The decoded answer string for the single input in the batch.
    """
    target_device = model.device
    # Pixel budgets bound the total/minimum visual tokens spent on the video.
    video_entry = {
        "type": "video",
        "video": video_path,
        "key_time": pred_glue,
        "total_pixels": 128 * 12 * 28 * 28,
        "min_pixels": 128 * 28 * 28,
    }
    messages = [
        {
            "role": "user",
            "content": [video_entry, {"type": "text", "text": prompt}],
        },
    ]
    chat_text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    image_inputs, video_inputs, video_kwargs = process_vision_info(
        messages, return_video_kwargs=True, client=client
    )
    # process_vision_info returns one fps per video; there is exactly one here.
    fps = video_kwargs["fps"][0]
    batch = processor(
        text=[chat_text],
        images=image_inputs,
        videos=video_inputs,
        fps=fps,
        padding=True,
        return_tensors="pt",
    )
    batch = {name: tensor.to(target_device) for name, tensor in batch.items()}
    with torch.no_grad():
        generated = model.generate(**batch, max_new_tokens=max_new_tokens, use_cache=True)
    # Strip each prompt prefix so only the newly generated tokens are decoded.
    trimmed = [seq[len(src):] for seq, src in zip(generated, batch["input_ids"])]
    decoded = processor.batch_decode(
        trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=True
    )
    return decoded[0]
# This is example usage code. You should replace the placeholders.
# For example:
# item = {"problem": {"question": "What is the person doing in the video?"}}
# client = None # Or initialize your client
# pred_glue = None
# answers = []
# for percption in range(num_percptions):
# if percption == num_percptions - 1:
# example_prompt = QA_THINK.replace("[QUESTION]", item["problem"]["question"])
# else:
# example_prompt = QA_THINK_GLUE.replace("[QUESTION]", item["problem"]["question"])
# ans = inference(video_path, example_prompt, model, processor, client=client, pred_glue=pred_glue)
# pattern_glue = r'<glue>(.*?)</glue>'  # extract the grounded time spans emitted between the <glue> tags
# match_glue = re.search(pattern_glue, ans, re.DOTALL)
# answers.append(ans)
# pred_glue = None
# try:
# if match_glue:
# glue = match_glue.group(1)
# pred_glue = ast.literal_eval(glue)
# except Exception as e:
# pred_glue = None
# print(ans)