Reward-Forcing

Paused

App Files Files Community

Reward-Forcing / videoalign /wan_inference.py

fffiloni

Migrated from GitHub

34c9227 verified 29 days ago

raw

history blame contribute delete

14.8 kB

	import ast
	import json
	import os
	os.environ["TOKENIZERS_PARALLELISM"] = "false"
	import pdb
	from collections.abc import Mapping
	import pandas as pd

	import torch
	from videoalign.vision_process import process_vision_info, process_wanvideo_tensor

	from videoalign.data import DataConfig
	from videoalign.utils import ModelConfig, PEFTLoraConfig, TrainingConfig
	from videoalign.utils import load_model_from_checkpoint
	from videoalign.train_reward import create_model_and_processor
	from videoalign.prompt_template import build_prompt

	import numpy as np
	from PIL import Image
	import imageio

	def load_configs_from_json(config_path):
	    """Load the config sections stored in a ``model_config.json`` file.

	    Args:
	        config_path: Path to the JSON file. It must contain the keys
	            ``data_config``, ``model_config`` and ``peft_lora_config``;
	            ``inference_config`` is optional.

	    Returns:
	        A 5-tuple ``(data_config, None, model_config, peft_lora_config,
	        inference_config)``. The second slot is always ``None`` and
	        ``inference_config`` is ``None`` when the file does not define it.

	    Raises:
	        KeyError: If one of the required sections is missing.
	    """
	    with open(config_path, "r", encoding="utf-8") as f:
	        config_dict = json.load(f)

	    data_config = config_dict["data_config"]
	    # These two entries are stripped before the section is handed back.
	    # Use pop() so a config that was saved without them still loads
	    # instead of failing with KeyError.
	    data_config.pop("meta_data", None)
	    data_config.pop("data_dir", None)

	    return (
	        data_config,
	        None,
	        config_dict["model_config"],
	        config_dict["peft_lora_config"],
	        config_dict.get("inference_config"),
	    )

	class VideoVLMRewardInference():
	    """Inference wrapper around a video reward model checkpoint.

	    The wrapped model returns ``logits`` with three columns that are read as
	    ``VQ``, ``MQ`` and ``TA`` (in that order). Two scoring entry points exist:

	    * ``reward`` takes video file paths and returns a list of plain-float
	      dicts whose ``Overall`` is the *sum* of the three scores.
	    * ``reward_from_frames`` takes in-memory video tensors and returns a dict
	      of tensors whose ``Overall`` is the *mean* of the three scores.

	    The two ``Overall`` definitions therefore differ by a factor of three.
	    """

	    def __init__(self, load_from_pretrained, load_from_pretrained_step=-1, device='cuda', dtype=torch.bfloat16):
	        """Build the model and processor and load the checkpoint weights.

	        Args:
	            load_from_pretrained: Checkpoint directory; must contain
	                ``model_config.json``.
	            load_from_pretrained_step: Step forwarded to the checkpoint
	                loader (default ``-1``).
	            device: Device the model and all input tensors are moved to.
	            dtype: ``torch.bfloat16`` enables the ``bf16`` training flag,
	                ``torch.float16`` enables ``fp16``; anything else leaves both
	                off.
	        """
	        config_path = os.path.join(load_from_pretrained, "model_config.json")
	        data_config, _, model_config, peft_lora_config, inference_config = load_configs_from_json(config_path)
	        data_config = DataConfig(**data_config)
	        model_config = ModelConfig(**model_config)
	        peft_lora_config = PEFTLoraConfig(**peft_lora_config)

	        training_args = TrainingConfig(
	            load_from_pretrained=load_from_pretrained,
	            load_from_pretrained_step=load_from_pretrained_step,
	            gradient_checkpointing=False,
	            disable_flash_attn2=False,
	            bf16=dtype == torch.bfloat16,
	            fp16=dtype == torch.float16,
	            output_dir="",
	        )

	        # The third return value (the PEFT config) is not needed here.
	        model, processor, _ = create_model_and_processor(
	            model_config=model_config,
	            peft_lora_config=peft_lora_config,
	            training_args=training_args,
	        )

	        self.device = device

	        # The loader also returns the checkpoint step, which is not used.
	        model, _ = load_model_from_checkpoint(model, load_from_pretrained, load_from_pretrained_step)
	        model.eval()

	        self.model = model
	        self.processor = processor

	        self.model.to(self.device)

	        self.data_config = data_config

	        # Optional dict with <DIM>_mean / <DIM>_std entries; None disables
	        # score normalisation.
	        self.inference_config = inference_config

	    def _norm(self, reward):
	        """Standardise the ``VQ``/``MQ``/``TA`` entries of ``reward`` in place.

	        Each score becomes ``(score - <DIM>_mean) / <DIM>_std`` using the
	        values in ``self.inference_config``. When no inference config was
	        loaded the dict is returned untouched.

	        Returns:
	            The same dict object that was passed in.
	        """
	        stats = self.inference_config
	        if stats is None:
	            return reward
	        for dim in ('VQ', 'MQ', 'TA'):
	            reward[dim] = (reward[dim] - stats[f'{dim}_mean']) / stats[f'{dim}_std']
	        return reward

	    def _pad_sequence(self, sequences, attention_mask, max_len, padding_side='right'):
	        """Pad ``sequences`` and ``attention_mask`` along dim 1 up to ``max_len``.

	        Sequences are filled with the tokenizer's pad token id and the mask
	        with zeros. Inputs that are already at least ``max_len`` long are
	        returned unchanged.

	        Args:
	            padding_side: ``'right'`` or ``'left'``.
	        """
	        assert padding_side in ['right', 'left']
	        current_len = sequences.shape[1]
	        if current_len >= max_len:
	            return sequences, attention_mask

	        pad_len = max_len - current_len
	        # F.pad takes (left, right) amounts for the last dimension.
	        padding = (0, pad_len) if padding_side == 'right' else (pad_len, 0)

	        sequences_padded = torch.nn.functional.pad(sequences, padding, 'constant', self.processor.tokenizer.pad_token_id)
	        attention_mask_padded = torch.nn.functional.pad(attention_mask, padding, 'constant', 0)

	        return sequences_padded, attention_mask_padded

	    def _prepare_input(self, data):
	        """Recursively move every tensor inside ``data`` to ``self.device``.

	        Mappings, tuples and lists are rebuilt with their own type; tensors
	        are moved; any other value is returned as is.
	        """
	        if isinstance(data, Mapping):
	            return type(data)({k: self._prepare_input(v) for k, v in data.items()})
	        if isinstance(data, (tuple, list)):
	            return type(data)(self._prepare_input(v) for v in data)
	        if isinstance(data, torch.Tensor):
	            # TODO: Maybe need to add dtype
	            return data.to(device=self.device)
	        return data

	    def _prepare_inputs(self, inputs):
	        """Move ``inputs`` to ``self.device`` and reject an empty batch.

	        Raises:
	            ValueError: If the prepared inputs have length zero.
	        """
	        inputs = self._prepare_input(inputs)
	        if len(inputs) == 0:
	            raise ValueError("The batch is empty: the processor produced no model inputs.")
	        return inputs

	    def prepare_batch_from_frames(self, video_tensors, prompts, fps=None, num_frames=None, max_pixels=None,):
	        """Build a model-ready batch from in-memory video tensors.

	        Args:
	            video_tensors: List[torch.Tensor], shape [T, C, H, W]
	            prompts: List[str]
	            fps, num_frames, max_pixels: Accepted for signature parity with
	                ``prepare_batch``; this method does not use them.

	        Returns:
	            The processor output with its tensors moved to ``self.device``.
	        """
	        # The video entry only makes the chat template emit a video
	        # placeholder; the actual frames are passed to the processor below.
	        chat_data = [
	            [
	                {
	                    "role": "user",
	                    "content": [
	                        {"type": "video", "video": "file://dummy_path"},
	                        {"type": "text", "text": build_prompt(prompt, self.data_config.eval_dim, self.data_config.prompt_template_type)},
	                    ],
	                }
	            ]
	            for prompt in prompts
	        ]

	        processed_videos = [process_wanvideo_tensor(tensor) for tensor in video_tensors]

	        batch = self.processor(
	            text=self.processor.apply_chat_template(chat_data, tokenize=False, add_generation_prompt=True),
	            images=None,
	            videos=processed_videos,
	            padding=True,
	            return_tensors="pt",
	            videos_kwargs={"do_rescale": True},
	        )

	        return self._prepare_inputs(batch)

	    def reward_from_frames(self, video_tensors, prompts, use_norm=True):
	        """Score in-memory videos and return the rewards as tensors.

	        Args:
	            video_tensors: List of video tensors, see
	                ``prepare_batch_from_frames``.
	            prompts: List[str], one prompt per video.
	            use_norm: Standardise the scores with the statistics in
	                ``self.inference_config``. Has no effect when no inference
	                config was loaded, matching ``_norm``.

	        Returns:
	            Dict with ``VQ``, ``MQ``, ``TA`` (columns 0, 1, 2 of the logits)
	            and ``Overall``, the mean of the three.
	        """
	        batch = self.prepare_batch_from_frames(video_tensors, prompts)
	        rewards = self.model(**batch, return_dict=True)["logits"]  # [B, 3]

	        vq = rewards[:, 0]
	        mq = rewards[:, 1]
	        ta = rewards[:, 2]

	        stats = self.inference_config
	        # Without statistics there is nothing to normalise with; return the
	        # raw scores instead of failing on a None lookup.
	        if use_norm and stats is not None:
	            vq = (vq - stats['VQ_mean']) / stats['VQ_std']
	            mq = (mq - stats['MQ_mean']) / stats['MQ_std']
	            ta = (ta - stats['TA_mean']) / stats['TA_std']

	        overall = (mq + vq + ta) / 3

	        return {
	            'VQ': vq,
	            'MQ': mq,
	            'TA': ta,
	            'Overall': overall
	        }

	    def prepare_batch(self, video_paths, prompts, fps=None, num_frames=None, max_pixels=None,):
	        """Build a model-ready batch from video files on disk.

	        ``fps``, ``num_frames`` and ``max_pixels`` fall back to the values in
	        ``self.data_config`` when ``None``. Videos are sampled by ``nframes``
	        when a frame count is available and by ``fps`` otherwise.

	        Returns:
	            The processor output with its tensors moved to ``self.device``.
	        """
	        fps = self.data_config.fps if fps is None else fps
	        num_frames = self.data_config.num_frames if num_frames is None else num_frames
	        max_pixels = self.data_config.max_frame_pixels if max_pixels is None else max_pixels

	        # NOTE(review): a frame count coming from the config selects the
	        # nframes branch even if the caller passed fps explicitly -- confirm
	        # this precedence is intended.
	        sampling = {"fps": fps} if num_frames is None else {"nframes": num_frames}

	        chat_data = [
	            [
	                {
	                    "role": "user",
	                    "content": [
	                        {
	                            "type": "video",
	                            "video": f"file://{video_path}",
	                            "max_pixels": max_pixels,
	                            **sampling,
	                            "sample_type": self.data_config.sample_type,
	                        },
	                        {"type": "text", "text": build_prompt(prompt, self.data_config.eval_dim, self.data_config.prompt_template_type)},
	                    ],
	                },
	            ]
	            for video_path, prompt in zip(video_paths, prompts)
	        ]
	        image_inputs, video_inputs = process_vision_info(chat_data)

	        batch = self.processor(
	            text=self.processor.apply_chat_template(chat_data, tokenize=False, add_generation_prompt=True),
	            images=image_inputs,
	            videos=video_inputs,
	            padding=True,
	            return_tensors="pt",
	            videos_kwargs={"do_rescale": True},
	        )
	        return self._prepare_inputs(batch)

	    def reward(self, video_paths, prompts, fps=None, num_frames=None, max_pixels=None, use_norm=True):
	        """Score videos stored on disk.

	        Inputs:
	            video_paths: List[str], B paths of the videos.
	            prompts: List[str], B prompts for the videos.
	            fps: float, sample rate of the videos. If None, use the default value in the config.
	            num_frames: int, number of frames of the videos. If None, use the default value in the config.
	            max_pixels: int, maximum pixels of the videos. If None, use the default value in the config.
	            use_norm: bool, whether to rescale the output rewards
	        Outputs:
	            Rewards: List[dict], one dict per video with float entries
	            ``VQ``, ``MQ``, ``TA`` and ``Overall`` (the sum of the three).
	        """
	        assert fps is None or num_frames is None, "fps and num_frames cannot be set at the same time."

	        batch = self.prepare_batch(video_paths, prompts, fps, num_frames, max_pixels)
	        logits = self.model(
	            return_dict=True,
	            **batch
	        )["logits"]

	        rewards = []
	        for row in logits:
	            reward = {'VQ': row[0].item(), 'MQ': row[1].item(), 'TA': row[2].item()}
	            if use_norm:
	                reward = self._norm(reward)
	            reward['Overall'] = reward['VQ'] + reward['MQ'] + reward['TA']
	            rewards.append(reward)

	        return rewards


	if __name__ == "__main__":
	    # Demo: score three sample videos, first from their file paths and then
	    # from frames decoded into tensors.
	    load_from_pretrained = "models/VideoReward"
	    device = "cuda:0"
	    dtype = torch.bfloat16

	    inferencer = VideoVLMRewardInference(load_from_pretrained, device=device, dtype=dtype)

	    video_paths = [
	        "test/video1.mp4",
	        "test/video2.mp4",
	        "test/video3.mp4",
	    ]

	    prompts = ["A stylish woman strolls down a bustling Tokyo street, the warm glow of neon lights and animated city signs casting vibrant reflections. She wears a sleek black leather jacket paired with a flowing red dress and black boots, her black purse slung over her shoulder. Sunglasses perched on her nose and a bold red lipstick add to her confident, casual demeanor. The street is damp and reflective, creating a mirror-like effect that enhances the colorful lights and shadows. Pedestrians move about, adding to the lively atmosphere. The scene is captured in a dynamic medium shot with the woman walking slightly to one side, highlighting her graceful strides.",
	               "A stunning mid-afternoon landscape photograph with a low camera angle, showcasing several giant wooly mammoths treading through a snowy meadow. Their long, wooly fur gently billows in the brisk wind as they move, creating a sense of natural movement. Snow-covered trees and dramatic snow-capped mountains loom in the distance, adding to the majestic setting. Wispy clouds and a high sun cast a warm glow over the scene, enhancing the serene and awe-inspiring atmosphere. The depth of field brings out the detailed textures of the mammoths and the snowy environment, capturing every nuance of these prehistoric giants in breathtaking clarity.",
	               "A movie trailer in a classic cinematic style, featuring the adventurous journey of a 30-year-old space man wearing a vibrant red wool knitted motorcycle helmet. The scene unfolds against a vast blue sky and a desolate salt desert landscape. Shot on 35mm film, the trailer showcases vivid and rich colors, capturing the hero as he navigates through the harsh terrain with determination. His helmet glints under the sun, adding to the dramatic effect. The background is a mix of sweeping desert vistas and distant horizons, with the occasional shimmer of light reflecting off the salt flats. A dynamic medium shot with a sweeping overhead angle, emphasizing the hero's resilience and the vastness of his adventure."]
	    with torch.no_grad():
	        rewards = inferencer.reward(video_paths, prompts, use_norm=True)
	        print(rewards)

	    def load_video_frames(video_path, num_frames=81):
	        """Read up to ``num_frames`` frames of a video into a [T, C, H, W] tensor.

	        On any failure the error is printed and an all-zero uint8 tensor of
	        shape ``[num_frames, 3, 224, 224]`` is returned instead.
	        """
	        try:
	            frames = []
	            # Use the reader as a context manager so the underlying file
	            # handle is released even when decoding fails part-way.
	            with imageio.get_reader(video_path) as reader:
	                for i, frame in enumerate(reader):
	                    if i >= num_frames:
	                        break
	                    # Materialise the frame as a plain ndarray via PIL.
	                    img = Image.fromarray(frame)
	                    frames.append(np.array(img))

	            # Convert to a tensor: [T, H, W, C] -> [T, C, H, W]
	            tensor = torch.tensor(np.stack(frames)).permute(0, 3, 1, 2)
	            print(tensor.shape)
	            return tensor
	        except Exception as e:
	            # Deliberate best-effort for the demo: report and substitute a
	            # blank clip so the remaining videos can still be scored.
	            print(f"fail: {video_path}, error: {e}")
	            return torch.zeros(num_frames, 3, 224, 224, dtype=torch.uint8)

	    video_tensors = [load_video_frames(path) for path in video_paths]

	    with torch.no_grad():
	        rewards = inferencer.reward_from_frames(video_tensors, prompts, use_norm=True)
	        print(rewards)