YAML Metadata Warning: empty or missing yaml metadata in repo card (https://huggingface.co/docs/hub/model-cards#model-card-metadata)

ReWatch-R1-SFT

Our ReWatch-R1 project page: https://rewatch-r1.github.io/

Using ReWatch-R1 to Inference

Use our model for video reasoning! Please use transformers==4.56.0 and qwen_vl_utils.
It is recommended to use the video parameters from the paper (up to 192 frames, with up to 128*28*28 pixels per frame).
For the best results, you must provide the duration of the video in the prompt as a time span (for example, 00:00-10:00); timestamps should be in MM:SS format.

import torch
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info

model_path = "ReWatch-R1-SFT"

# Load the ReWatch-R1-SFT checkpoint in bfloat16 with FlashAttention-2.
# NOTE(review): requires flash-attn to be installed — confirm in the target env.
load_kwargs = dict(
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    attn_implementation="flash_attention_2",
)
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(model_path, **load_kwargs)

# Processor handles chat templating plus image/video preprocessing.
processor = AutoProcessor.from_pretrained(
    model_path,
    trust_remote_code=True,
    use_fast=True,
    padding_side="left",  # decoder-only generation pads on the left
    truncation_side="right",
)

# Inference inputs: a local video file, its length in seconds, and the question.
video_path = "videos/example.mp4"
video_duration = 600  # seconds
question = "What happened from [05:00] to [05:10]?"

# Video sampling parameters recommended in the paper:
# up to 192 frames at 2 fps, with a per-frame pixel budget of 128*28*28.
total_pixels = 12288 * 28 * 28  # overall pixel budget across all sampled frames
min_pixels = 128 * 28 * 28
max_pixels = 128 * 28 * 28
fps = 2.0
max_frames = 192

# Video entry for the multimodal message content (qwen_vl_utils format).
video_config = {
    "type": "video",
    "video": video_path,
    "total_pixels": total_pixels,
    "min_pixels": min_pixels,
    "max_pixels": max_pixels,
    "fps": fps,
    "max_frames": max_frames,
}

# ReAct-style instruction template; {video_duration} and {question} are filled in later.
react_prompt = """You are a video understanding expert. You are given a video and a question. You need to answer the question based on the video content. Please answer the question step by step. When you need more video details, you will re-watch the relevant clips and use <action> and </action> to mark the actions, and use <observation> and </observation> to mark the visual details you observe. When you have enough information to determine the final answer, you will wrap the final answer in <answer> and </answer>.

**Video Information and Question:**
- **Video Duration**: {video_duration}
- **Question**: {question}"""

def seconds_to_timestamp(seconds):
    """Convert an integer second count to an MM:SS timestamp string."""
    minutes, secs = divmod(seconds, 60)
    return f"{minutes:02d}:{secs:02d}"

# Full-video time span in "00:00-MM:SS" form, as the model card recommends.
duration_str = "00:00-" + seconds_to_timestamp(video_duration)
instruction = react_prompt.format(video_duration=duration_str, question=question)

# Chat messages: system preamble, then the video plus the formatted instruction.
user_content = [
    video_config,
    {"type": "text", "text": instruction},
]
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": user_content},
]

# Render the chat template as a string (tokenization happens below) and
# extract the vision inputs plus any extra video kwargs from the messages.
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
image_inputs, video_inputs, video_kwargs = process_vision_info(
    messages, return_video_kwargs=True
)

# Tokenize text and pack the vision tensors; cap the sequence at 16384 tokens.
inputs = processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
    max_length=16384,
    truncation=True,
    do_sample_frames=False,  # frames were already sampled by process_vision_info
    **video_kwargs,
)
inputs = inputs.to("cuda")

# Greedy decoding (do_sample=False), up to 4096 new tokens.
generated_ids = model.generate(**inputs, do_sample=False, max_new_tokens=4096, use_cache=True)

# Drop the prompt portion of each sequence so only generated tokens remain.
trimmed_ids = [
    full_ids[len(prompt_ids):]
    for prompt_ids, full_ids in zip(inputs.input_ids, generated_ids)
]
output_text = processor.batch_decode(
    trimmed_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
)
print(output_text)
Downloads last month
6
Safetensors
Model size
8B params
Tensor type
BF16
·
Inference Providers NEW
This model isn't deployed by any Inference Provider. 🙋 Ask for provider support