Spaces:
Runtime error
Runtime error
| import numpy as np | |
| from PIL import Image | |
| import torch.nn.functional as F | |
| from typing import List | |
| from transformers import AutoTokenizer, AutoModel, AutoProcessor | |
# Presumably the number of evaluation aspects in the wider benchmark — not referenced in this chunk; confirm against callers.
NUM_ASPECT=5
# Presumably decimal digits for score rounding — not referenced in this chunk; confirm against callers.
ROUND_DIGIT=3
# Max token length passed to the X-CLIP tokenizer (CLIP-style text-length cap).
MAX_LENGTH = 76
# Number of frames uniformly sampled from each video before feature extraction.
MAX_NUM_FRAMES=8
# Cosine-similarity thresholds used by MetricXCLIPScore.evaluate to quantize
# the raw score into bins 1..4: (<LOW)->1, [LOW,MID)->2, [MID,HIGH)->3, (>=HIGH)->4.
X_CLIP_POINT_LOW=0.15
X_CLIP_POINT_MID=0.225
X_CLIP_POINT_HIGH=0.30
| def _read_video_frames(frames, max_frames): | |
| total_frames = len(frames) | |
| indices = np.linspace(0, total_frames - 1, num=max_frames).astype(int) | |
| selected_frames = [np.array(frames[i]) for i in indices] | |
| return np.stack(selected_frames) | |
class MetricXCLIPScore():
    """Text-to-video alignment metric based on X-CLIP cosine similarity."""

    def __init__(self, device="cuda") -> None:
        """
        Initialize a MetricXCLIPScore object with the specified device.

        Args:
            device (str, optional): The device on which the model will run. Defaults to "cuda".
                NOTE(review): the original implementation never moves the model to
                this device — inference runs wherever ``from_pretrained`` places
                the weights (CPU by default). Stored here for visibility; confirm
                intended placement before relying on it.
        """
        self.device = device
        self.model = AutoModel.from_pretrained("microsoft/xclip-base-patch32")
        self.processor = AutoProcessor.from_pretrained("microsoft/xclip-base-patch32")
        self.tokenizer = AutoTokenizer.from_pretrained("microsoft/xclip-base-patch32")

    def evaluate(self, frame_list: List[Image.Image], text: str):
        """
        Compute the cosine similarity between X-CLIP features of the text prompt
        and of the given video to test text-to-video alignment, then quantize
        the raw score using the predefined thresholds.

        Args:
            frame_list (List[Image.Image]): frames of the video used in calculation.
            text (str): text prompt for generating the video.

        Returns:
            cos_sim (float): the raw X-CLIP cosine similarity between the video
                and its text prompt.
            quantized_ans (int): value in {1, 2, 3, 4} obtained by thresholding
                cos_sim against X_CLIP_POINT_LOW / _MID / _HIGH.
        """
        input_text = self.tokenizer([text], max_length=MAX_LENGTH, truncation=True, padding=True, return_tensors="pt")
        text_feature = self.model.get_text_features(**input_text).flatten()
        # Uniformly subsample so every video contributes a fixed number of frames.
        video = _read_video_frames(frame_list, MAX_NUM_FRAMES)
        input_video = self.processor(videos=list(video), return_tensors="pt")
        video_feature = self.model.get_video_features(**input_video).flatten()
        cos_sim = F.cosine_similarity(text_feature, video_feature, dim=0).item()
        # Quantize: (<LOW)->1, [LOW,MID)->2, [MID,HIGH)->3, (>=HIGH)->4.
        # Each elif already implies the lower bound, so the redundant
        # "cos_sim >= ..." conjuncts of the original are dropped.
        if cos_sim < X_CLIP_POINT_LOW:
            quantized_ans = 1
        elif cos_sim < X_CLIP_POINT_MID:
            quantized_ans = 2
        elif cos_sim < X_CLIP_POINT_HIGH:
            quantized_ans = 3
        else:
            quantized_ans = 4
        return cos_sim, quantized_ans