# ReWatch-R1-SFT
Our ReWatch-R1 project page: https://rewatch-r1.github.io/
## Using ReWatch-R1 for Inference
Use our model for video reasoning! Please use transformers==4.56.0 and qwen_vl_utils. \
We recommend using the video parameters from the paper (up to 192 frames, with a resolution of 128\*28\*28 per frame). \
For best results, you must provide the video's duration in the prompt (for example, 00:00-10:00); timestamps should be in MM\:SS format.
```python
import torch
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info

# Load the ReWatch-R1 checkpoint in bfloat16 with FlashAttention-2 enabled.
model_path = "ReWatch-R1-SFT"
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    model_path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    attn_implementation="flash_attention_2",
)

# Left padding suits batched generation; over-long inputs are cut on the right.
processor = AutoProcessor.from_pretrained(
    model_path,
    trust_remote_code=True,
    use_fast=True,
    padding_side="left",
    truncation_side="right",
)

# Input video/question plus the frame-sampling budget recommended in the paper
# (up to 192 frames, 128*28*28 pixels per frame).
video_path = "videos/example.mp4"
video_duration = 600  # seconds
question = "What happened from [05:00] to [05:10]?"
total_pixels = 12288 * 28 * 28
min_pixels = 128 * 28 * 28
max_pixels = 128 * 28 * 28
fps = 2.0
max_frames = 192
video_config = {
    "type": "video",
    "video": video_path,
    "total_pixels": total_pixels,
    "min_pixels": min_pixels,
    "max_pixels": max_pixels,
    "fps": fps,
    "max_frames": max_frames,
}

# ReAct-style instruction template; {video_duration} and {question} are filled in later.
react_prompt = """You are a video understanding expert. You are given a video and a question. You need to answer the question based on the video content. Please answer the question step by step. When you need more video details, you will re-watch the relevant clips and use <action> and </action> to mark the actions, and use <observation> and </observation> to mark the visual details you observe. When you have enough information to determine the final answer, you will wrap the final answer in <answer> and </answer>.
**Video Information and Question:**
- **Video Duration**: {video_duration}
- **Question**: {question}"""
def seconds_to_timestamp(seconds):
    """Convert a duration in seconds to an "MM:SS" timestamp string.

    Args:
        seconds: Non-negative duration in seconds. Floats are truncated to
            whole seconds so the zero-padded integer format never raises.

    Returns:
        The duration as zero-padded "MM:SS" (the minutes field may grow
        beyond two digits for durations of 100 minutes or more).
    """
    # divmod keeps the minute/second split in one step; int() generalizes
    # the original (which raised on floats) while preserving int behavior.
    minutes, secs = divmod(int(seconds), 60)
    return f"{minutes:02d}:{secs:02d}"
# Express the full span of the video as "00:00-MM:SS" for the prompt.
duration_str = f"00:00-{seconds_to_timestamp(video_duration)}"
instruction = react_prompt.format(video_duration=duration_str, question=question)

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {
        "role": "user",
        "content": [
            video_config,
            {"type": "text", "text": instruction},
        ],
    },
]

# Render the chat template, then extract the sampled video frames.
text = processor.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
)
image_inputs, video_inputs, video_kwargs = process_vision_info(
    messages, return_video_kwargs=True
)
inputs = processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
    max_length=16384,
    truncation=True,
    do_sample_frames=False,  # presumably frames were pre-sampled by process_vision_info — verify
    **video_kwargs,
)
inputs = inputs.to("cuda")

# Greedy decoding; drop the prompt tokens before decoding the model's answer.
generated_ids = model.generate(
    **inputs, do_sample=False, max_new_tokens=4096, use_cache=True
)
generated_ids_trimmed = [
    full_seq[len(prompt_ids):]
    for prompt_ids, full_seq in zip(inputs.input_ids, generated_ids)
]
output_text = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)
print(output_text)
```