# ReWatch-R1-SFT
Our ReWatch-R1 project page: https://rewatch-r1.github.io/
## Using ReWatch-R1 for Inference
Use our model for video reasoning! Please use transformers==4.56.0 and qwen_vl_utils. \
We recommend using the video parameters from the paper (up to 192 frames, with a resolution of 128\*28\*28 per frame). \
For best results, you must provide the video's duration in the prompt (for example, 00:00-10:00); timestamps should be in MM\:SS format.
```python
import torch
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info

# Load the ReWatch-R1 checkpoint in bfloat16 with FlashAttention-2 enabled.
model_path = "ReWatch-R1-SFT"
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    model_path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    attn_implementation="flash_attention_2",
)

# Left padding suits batched generation; over-long inputs are cut on the right.
processor = AutoProcessor.from_pretrained(
    model_path,
    trust_remote_code=True,
    use_fast=True,
    padding_side="left",
    truncation_side="right",
)

# Input video/question plus the frame-sampling budget recommended in the paper
# (up to 192 frames, 128*28*28 pixels per frame).
video_path = "videos/example.mp4"
video_duration = 600  # seconds
question = "What happened from [05:00] to [05:10]?"
total_pixels = 12288 * 28 * 28
min_pixels = 128 * 28 * 28
max_pixels = 128 * 28 * 28
fps = 2.0
max_frames = 192
video_config = {
    "type": "video",
    "video": video_path,
    "total_pixels": total_pixels,
    "min_pixels": min_pixels,
    "max_pixels": max_pixels,
    "fps": fps,
    "max_frames": max_frames,
}

# ReAct-style instruction template; {video_duration} and {question} are filled in later.
react_prompt = """You are a video understanding expert. You are given a video and a question. You need to answer the question based on the video content. Please answer the question step by step. When you need more video details, you will re-watch the relevant clips and use <action> and </action> to mark the actions, and use <observation> and </observation> to mark the visual details you observe. When you have enough information to determine the final answer, you will wrap the final answer in <answer> and </answer>.
**Video Information and Question:**
- **Video Duration**: {video_duration}
- **Question**: {question}"""
def seconds_to_timestamp(seconds):
    """Convert a duration in seconds to an "MM:SS" timestamp string.

    Args:
        seconds: Non-negative duration in seconds. Floats are truncated to
            whole seconds so the zero-padded integer format never raises.

    Returns:
        The duration as zero-padded "MM:SS" (the minutes field may grow
        beyond two digits for durations of 100 minutes or more).
    """
    # divmod keeps the minute/second split in one step; int() generalizes
    # the original (which raised on floats) while preserving int behavior.
    minutes, secs = divmod(int(seconds), 60)
    return f"{minutes:02d}:{secs:02d}"
# Express the full span of the video as "00:00-MM:SS" for the prompt.
duration_str = f"00:00-{seconds_to_timestamp(video_duration)}"
instruction = react_prompt.format(video_duration=duration_str, question=question)

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {
        "role": "user",
        "content": [
            video_config,
            {"type": "text", "text": instruction},
        ],
    },
]

# Render the chat template, then extract the sampled video frames.
text = processor.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
)
image_inputs, video_inputs, video_kwargs = process_vision_info(
    messages, return_video_kwargs=True
)
inputs = processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
    max_length=16384,
    truncation=True,
    do_sample_frames=False,  # presumably frames were pre-sampled by process_vision_info — verify
    **video_kwargs,
)
inputs = inputs.to("cuda")

# Greedy decoding; drop the prompt tokens before decoding the model's answer.
generated_ids = model.generate(
    **inputs, do_sample=False, max_new_tokens=4096, use_cache=True
)
generated_ids_trimmed = [
    full_seq[len(prompt_ids):]
    for prompt_ids, full_seq in zip(inputs.input_ids, generated_ids)
]
output_text = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)
print(output_text)
```