Ming-Lite-Omni / bailingmm_utils.py

LandyGuo

update v20250516 ckpts

dd5f831 11 months ago

22.1 kB

	from __future__ import annotations

	import base64
	import logging
	import math
	import os
	import sys
	import time
	import warnings
	from functools import lru_cache
	from io import BytesIO

	import random
	import numpy as np

	import requests
	import torch
	import torchvision
	from packaging import version

	from PIL import Image
	import torchaudio
	from torchvision import io, transforms
	from torchvision.transforms import InterpolationMode
	from typing import Union, Tuple, List

	logger = logging.getLogger(__name__)

	IMAGE_FACTOR = 28
	MIN_PIXELS = 4 * 28 * 28
	MAX_PIXELS = 1024 * 28 * 28
	MAX_RATIO = 200

	VIDEO_MIN_PIXELS = 128 * 28 * 28
	VIDEO_MAX_PIXELS = 768 * 28 * 28 # 4: 3 => 32: 24 (768) \| 16:9 => 32:18 (576)
	VIDEO_TOTAL_PIXELS = 9216 * 28 * 28 # 9216: 24-72 frames \| 7680: 10-60 frames \| 6144: 8-48 frames

	FRAME_FACTOR = 2
	FPS = 2.0
	FPS_MIN_FRAMES = 4
	FPS_MAX_FRAMES = 128

	def is_decord_available() -> bool:
	import importlib.util
	return importlib.util.find_spec("decord") is not None

	def round_by_factor(number: int, factor: int) -> int:
	"""Returns the closest integer to 'number' that is divisible by 'factor'."""
	return round(number / factor) * factor

	def ceil_by_factor(number: int, factor: int) -> int:
	"""Returns the smallest integer greater than or equal to 'number' that is divisible by 'factor'."""
	return math.ceil(number / factor) * factor

	def floor_by_factor(number: int, factor: int) -> int:
	"""Returns the largest integer less than or equal to 'number' that is divisible by 'factor'."""
	return math.floor(number / factor) * factor

	def is_image(image_file):
	if isinstance(image_file, str) and (image_file.startswith("base64,") or image_file.lower().endswith(
	('.bmp', '.dib', '.png', '.jpg', '.jpeg', '.pbm', '.pgm', '.ppm', '.tif', '.tiff'))):
	return True
	elif isinstance(image_file, Image.Image):
	return True
	else:
	return False

	def is_video(video_file):
	if isinstance(video_file, str) and video_file.lower().endswith(
	('.mp4', '.mkv', '.avi', '.wmv', '.iso', ".webm")):
	return True
	else:
	return False

	def is_audio(audio_file):
	if isinstance(audio_file, str) and audio_file.lower().endswith(
	(".wav", ".mp3", ".aac", ".flac", ".alac", ".m4a", ".ogg", ".wma", ".aiff", ".amr", ".au")):
	return True
	else:
	return False

	def smart_resize(
	height: int, width: int, factor: int = IMAGE_FACTOR, min_pixels: int = MIN_PIXELS, max_pixels: int = MAX_PIXELS
	) -> tuple[int, int]:
	"""
	Rescales the image so that the following conditions are met:

	1. Both dimensions (height and width) are divisible by 'factor'.

	2. The total number of pixels is within the range ['min_pixels', 'max_pixels'].

	3. The aspect ratio of the image is maintained as closely as possible.
	"""
	if max(height, width) / min(height, width) > MAX_RATIO:
	raise ValueError(
	f"absolute aspect ratio must be smaller than {MAX_RATIO}, got {max(height, width) / min(height, width)}"
	)
	h_bar = max(factor, round_by_factor(height, factor))
	w_bar = max(factor, round_by_factor(width, factor))
	if h_bar * w_bar > max_pixels:
	beta = math.sqrt((height * width) / max_pixels)
	h_bar = floor_by_factor(height / beta, factor)
	w_bar = floor_by_factor(width / beta, factor)
	elif h_bar * w_bar < min_pixels:
	beta = math.sqrt(min_pixels / (height * width))
	h_bar = ceil_by_factor(height * beta, factor)
	w_bar = ceil_by_factor(width * beta, factor)
	return h_bar, w_bar

	def fetch_image(ele: dict[str, str \| Image.Image], size_factor: int = IMAGE_FACTOR) -> Image.Image:
	if "image" in ele:
	image = ele["image"]
	else:
	image = ele["image_url"]
	image_obj = None
	if isinstance(image, Image.Image):
	image_obj = image
	elif image.startswith("http://") or image.startswith("https://"):
	image_obj = Image.open(requests.get(image, stream=True).raw)
	elif image.startswith("file://"):
	image_obj = Image.open(image[7:])
	elif image.startswith("data:image"):
	if "base64," in image:
	_, base64_data = image.split("base64,", 1)
	data = base64.b64decode(base64_data)
	image_obj = Image.open(BytesIO(data))
	else:
	image_obj = Image.open(image)
	if image_obj is None:
	raise ValueError(f"Unrecognized image input, support local path, http url, base64 and PIL.Image, got {image}")
	image = image_obj.convert("RGB")
	## resize
	if "resized_height" in ele and "resized_width" in ele:
	resized_height, resized_width = smart_resize(
	ele["resized_height"],
	ele["resized_width"],
	factor=size_factor,
	)
	else:
	width, height = image.size
	min_pixels = ele.get("min_pixels", MIN_PIXELS)
	max_pixels = ele.get("max_pixels", MAX_PIXELS)
	resized_height, resized_width = smart_resize(
	height,
	width,
	factor=size_factor,
	min_pixels=min_pixels,
	max_pixels=max_pixels,
	)
	image = image.resize((resized_width, resized_height))

	return image

	def sample_frames(num_frames, total_frames, sample="random"):
	if sample == "sequence":
	frame_indices = np.linspace(0, total_frames - 1, num_frames, dtype=int)
	else:
	intervals = np.linspace(start=0, stop=total_frames, num=num_frames + 1, dtype=int)
	ranges = []
	for idx, interv in enumerate(intervals[:-1]):
	ranges.append((interv, intervals[idx + 1] - 1))
	if sample == "random":
	try:
	frame_indices = [random.choice(range(x[0], x[1])) for x in ranges]
	except:
	frame_indices = np.random.permutation(total_frames)[:num_frames]
	frame_indices.sort()
	frame_indices = list(frame_indices)
	if len(frame_indices) < num_frames:
	padded_frame_indices = [frame_indices[-1]] * num_frames
	padded_frame_indices[:len(frame_indices)] = frame_indices
	frame_indices = padded_frame_indices
	elif sample == "uniform" or sample == "adaptive":
	frame_indices = [(x[0] + x[1]) // 2 for x in ranges]
	if len(frame_indices) < num_frames:
	frame_indices = [
	frame_indices[int((num_frames - 1) * i / (num_frames - 1) + 0.5)] for i in range(num_frames)
	]
	else:
	raise NotImplementedError
	return frame_indices

	def get_frames(
	ele: dict,
	total_frames: int,
	) -> int:
	"""calculate the number of frames for video used for model inputs.
	Args:
	ele (dict): a dict contains the configuration of video.
	total_frames (int): the original total number of frames of the video.
	Returns:
	int: the number of frames for video used for model inputs.
	"""
	if "nframes" in ele:
	num_frames = round_by_factor(ele["nframes"], FRAME_FACTOR)
	else:
	min_frames = ceil_by_factor(ele.get("min_frames", FPS_MIN_FRAMES), FRAME_FACTOR)
	max_frames = floor_by_factor(ele.get("max_frames", min(FPS_MAX_FRAMES, total_frames)), FRAME_FACTOR)
	num_frames = max(min(total_frames, max_frames), min_frames)
	num_frames = floor_by_factor(num_frames, FRAME_FACTOR)

	if not (FRAME_FACTOR <= num_frames <= total_frames):
	raise ValueError(f"nframes should in interval [{FRAME_FACTOR}, {total_frames}], but got {num_frames}.")
	return num_frames

	def _read_video_torchvision(
	ele: dict,
	) -> (torch.Tensor, float):
	"""read video using torchvision.io.read_video
	Args:
	ele (dict): a dict contains the configuration of video.
	support keys:
	- video: the path of video. support "file://", "http://", "https://" and local path.
	- video_start: the start time of video.
	- video_end: the end time of video.
	Returns:
	torch.Tensor: the video tensor with shape (T, C, H, W).
	"""
	video_path = ele["video"]
	if version.parse(torchvision.__version__) < version.parse("0.19.0"):
	if "http://" in video_path or "https://" in video_path:
	warnings.warn("torchvision < 0.19.0 does not support http/https video path, please upgrade to 0.19.0.")
	if "file://" in video_path:
	video_path = video_path[7:]

	sample_method = ele.get("sample", "sequence")
	pts_unit = "sec" if sample_method == "sequence" else "pts"
	st = time.time()
	video, audio, info = io.read_video(
	video_path,
	start_pts=ele.get("video_start", 0.0),
	end_pts=ele.get("video_end", None),
	pts_unit=pts_unit,
	output_format="TCHW",
	)
	total_frames, video_fps = video.size(0), info["video_fps"]
	logger.info(f"torchvision: {video_path=}, {total_frames=}, {video_fps=}, time={time.time() - st:.3f}s")

	num_frames = get_frames(ele, total_frames)
	frame_indices = sample_frames(
	num_frames=num_frames, total_frames=total_frames, sample=sample_method
	)
	video = video[frame_indices]
	sample_fps = num_frames / max(total_frames, 1e-6) * video_fps
	return video, sample_fps

	def _read_video_decord(
	ele: dict,
	) -> (torch.Tensor, float):
	"""read video using decord.VideoReader

	Args:
	ele (dict): a dict contains the configuration of video.
	support keys:
	- video: the path of video. support "file://", "http://", "https://" and local path.
	- video_start: the start time of video.
	- video_end: the end time of video.
	Returns:
	torch.Tensor: the video tensor with shape (T, C, H, W).
	"""
	import decord
	video_path = ele["video"]

	st = time.time()
	vr = decord.VideoReader(video_path)
	if 'video_start' in ele or 'video_end' in ele:
	raise NotImplementedError("not support start_pts and end_pts in decord for now.")
	total_frames, video_fps = len(vr), vr.get_avg_fps()
	logger.info(f"decord: {video_path=}, {total_frames=}, {video_fps=}, time={time.time() - st:.3f}s")

	sample_method = ele.get("sample", "sequence")
	# if sample_method == "sequence":
	# total_frames = int(total_frames / video_fps * 2)
	if video_fps > 2.0 and total_frames / float(video_fps) > 5.0:
	num_frames = get_frames(ele, int(total_frames / float(video_fps) * 2))
	else:
	num_frames = get_frames(ele, total_frames)
	frame_indices = sample_frames(
	num_frames=num_frames, total_frames=total_frames, sample=sample_method
	)
	if sample_method == "adaptive" and len(frame_indices) > 64:
	frames_indices_selected = select_frames_based_on_query(vr, frame_indices, ele) # query的扩模态采样结果
	indices = np.linspace(0, len(frame_indices) - 1, len(frame_indices)//2, dtype=int)
	frame_indices = np.array(frame_indices)[indices].tolist()
	frames_indices_selected_sort = np.sort(frame_indices + frames_indices_selected[:(num_frames - len(frame_indices))].tolist()).tolist()
	video = vr.get_batch(frames_indices_selected_sort).asnumpy()
	else:
	video = vr.get_batch(frame_indices).asnumpy()

	# video = vr.get_batch(frame_indices).asnumpy()
	video = torch.tensor(video).permute(0, 3, 1, 2) # Convert to TCHW format
	sample_fps = num_frames / max(total_frames, 1e-6) * video_fps
	return video, sample_fps

	def select_frames_based_on_query(vr, frame_indices, ele):
	import sys
	sys.path.join("./longvu")
	'''
	This LongVU model (https://github.com/Vision-CAIR/LongVU) computes cross-modal relevance
	between user queries and video frames for the purpose of frame selection.
	It can also be replaced with other text/visual encoders to achieve the same effect.
	To maintain consistency in the repository structure, this module has not been included in the repository directory for now.
	If needed for evaluation, simply import this module.
	'''
	from longvu.constants import (
	DEFAULT_IMAGE_TOKEN,
	IMAGE_TOKEN_INDEX,
	)
	from longvu.conversation import conv_templates, SeparatorStyle
	from longvu.mm_datautils import (
	KeywordsStoppingCriteria,
	process_images,
	tokenizer_image_token,
	)
	tokenizer, model, image_processor = ele["tokenizer"], ele["model"], ele["image_processor"]

	# 考虑在这里扩展frame_indices
	video = vr.get_batch(frame_indices).asnumpy() # (21, 320, 568, 3)

	image_sizes = [video[0].shape[:2]] # [(320, 568)]
	video = process_images(video, image_processor, model.config) # len(video)=2, 第一个 torch.Size([623, 3, 384, 384])，第二个 torch.Size([623, 3, 378, 378])
	video = [item.unsqueeze(0) for item in video] # len(video)=2, 第一个 torch.Size([1, 623, 3, 384, 384])，第二个 torch.Size([1, 623, 3, 378, 378])

	qs = DEFAULT_IMAGE_TOKEN + "\n" + ele["text"]
	conv = conv_templates["qwen"].copy()
	conv.append_message(conv.roles[0], qs)
	conv.append_message(conv.roles[1], None)
	prompt = conv.get_prompt()

	input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).to(model.device) # torch.Size([1, 26])
	stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2 # '<\|im_end\|>'
	keywords = [stop_str]
	stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)

	with torch.inference_mode():
	output_ids = model.generate(
	input_ids,
	images=video,
	image_sizes=image_sizes,
	do_sample=False,
	temperature=0.2,
	max_new_tokens=128,
	use_cache=True,
	stopping_criteria=[stopping_criteria],
	) # torch.Size([1, 128])

	selected_indices = np.array(frame_indices)[output_ids.cpu().numpy()]
	return selected_indices

	VIDEO_READER_BACKENDS = {
	"decord": _read_video_decord,
	"torchvision": _read_video_torchvision,
	}

	FORCE_BAILINGNATIVE_VIDEO_READER = os.getenv("FORCE_BAILINGNATIVE_VIDEO_READER", None)

	@lru_cache(maxsize=1)
	def get_video_reader_backend() -> str:
	if FORCE_BAILINGNATIVE_VIDEO_READER is not None:
	video_reader_backend = FORCE_BAILINGNATIVE_VIDEO_READER
	elif is_decord_available():
	video_reader_backend = "decord"
	else:
	video_reader_backend = "torchvision"
	print(f"bailing-native-utils using {video_reader_backend} to read video.", file=sys.stderr)
	return video_reader_backend

	def fetch_video(ele: dict, image_factor: int = IMAGE_FACTOR, return_video_sample_fps: bool = False) -> torch.Tensor \| \
	list[
	Image.Image]:
	if isinstance(ele["video"], str):
	if ele["video"].startswith("file://"):
	ele["video"] = ele["video"][7:]
	video_reader_backend = get_video_reader_backend()
	try:
	video, sample_fps = VIDEO_READER_BACKENDS[video_reader_backend](ele)
	except Exception as e:
	logger.warning(f"video_reader_backend {video_reader_backend} error, use torchvision as default, msg: {e}")
	video, sample_fps = VIDEO_READER_BACKENDS["torchvision"](ele)

	if "resized_height" in ele and "resized_width" in ele:
	resized_height, resized_width = smart_resize(
	ele["resized_height"],
	ele["resized_width"],
	factor=image_factor,
	)
	else:
	num_frames, _, height, width = video.shape
	min_pixels = ele.get("min_pixels", VIDEO_MIN_PIXELS)
	total_pixels = ele.get("total_pixels", VIDEO_TOTAL_PIXELS)
	max_pixels = max(min(VIDEO_MAX_PIXELS, total_pixels / num_frames * FRAME_FACTOR), int(min_pixels * 1.05))
	max_pixels_supposed = ele.get("max_pixels", max_pixels)
	if max_pixels_supposed > max_pixels:
	logger.warning(f"The given max_pixels[{max_pixels_supposed}] exceeds limit[{max_pixels}].")
	max_pixels = min(max_pixels_supposed, max_pixels)

	resized_height, resized_width = smart_resize(
	height,
	width,
	factor=28,
	min_pixels=min_pixels,
	max_pixels=max_pixels,
	)
	video = transforms.functional.resize(
	video,
	[resized_height, resized_width],
	interpolation=InterpolationMode.BICUBIC,
	antialias=True,
	).float()
	if return_video_sample_fps:
	return video, sample_fps
	return video
	else:
	assert isinstance(ele["video"], (list, tuple))
	process_info = ele.copy()
	process_info.pop("type", None)
	process_info.pop("video", None)
	images = [
	fetch_image({"image": video_element, **process_info}, size_factor=image_factor)
	for video_element in ele["video"]
	]
	# if len(images) > ele["max_frames"]:
	# num_frames_target = ele["max_frames"]
	# print(ele["max_frames"])
	# interval = len(images) // num_frames_target # 计算抽取间隔
	# images = [images[i] for i in range(0, len(images), interval)][:num_frames_target]
	num_frames = ceil_by_factor(len(images), FRAME_FACTOR)
	if len(images) < num_frames:
	images.extend([images[-1]] * (num_frames - len(images)))
	if len(images) > ele["max_frames"]:
	frame_indices = sample_frames(
	num_frames=ele["max_frames"], total_frames=len(images), sample="uniform",
	)
	images = [images[i] for i in frame_indices]
	if return_video_sample_fps:
	return images, process_info.pop("sample_fps", 2.0)
	return images

	def fetch_audio(ele: dict[str, str \| torch.Tensor], return_tensor="pt") -> Tuple[Union[torch.Tensor, np.ndarray], int]:
	if "audio" in ele:
	audio = ele["audio"]
	else:
	audio = ele["audio_url"]

	if isinstance(audio, torch.Tensor):
	waveform = audio
	sample_rate: int = ele.get("sample_rate", 16000)
	elif audio.startswith("http://") or audio.startswith("https://"):
	audio_file = BytesIO(requests.get(audio, stream=True).content)
	waveform, sample_rate = torchaudio.load(audio_file)
	elif audio.startswith("file://"):
	waveform, sample_rate = torchaudio.load(audio[7:])
	else:
	waveform, sample_rate = torchaudio.load(audio)
	if return_tensor == "pt":
	return waveform, sample_rate
	else:
	return waveform.numpy(), sample_rate

	def extract_vision_info(conversations: list[dict] \| list[list[dict]]) -> list[dict]:
	vision_infos = []
	if isinstance(conversations[0], dict):
	conversations = [conversations]
	for conversation in conversations:
	for message in conversation:
	if isinstance(message["content"], list):
	for ele in message["content"]:
	if (
	"image" in ele
	or "image_url" in ele
	or "video" in ele
	or "audio" in ele
	or ele["type"] in ("image", "image_url", "video")
	):
	vision_infos.append(ele)
	# 把视频的 query_text 也加进来
	if "text" in ele: text = ele["text"]
	if "video" in ele and ele["sample"] == "adaptive":
	tokenizer = ele["tokenizer"]
	model = ele["model"]
	image_processor = ele["image_processor"]
	for ele in vision_infos:
	if "video" in ele and ele["sample"] == "adaptive":
	ele["text"] = text
	ele["tokenizer"] = tokenizer
	ele["model"] = model
	ele["image_processor"] = image_processor
	return vision_infos
	return vision_infos

	def process_vision_info(
	conversations: list[dict] \| list[list[dict]],
	) -> tuple[list[Image.Image] \| None, list[torch.Tensor \| list[Image.Image]] \| None, list[
	torch.Tensor \| list[np.ndarray]] \| None]:
	vision_infos = extract_vision_info(conversations)
	## Read images, videos or audios
	image_inputs = []
	video_inputs = []
	audio_inputs = []
	for vision_info in vision_infos:
	if "image" in vision_info or "image_url" in vision_info:
	if isinstance(vision_info["image"], (tuple, list)):
	for i in range(len(vision_info["image"])):
	image_inputs.append(fetch_image({"type": "image", "image": vision_info["image"][i]}))
	else:
	image_inputs.append(fetch_image(vision_info))
	elif "video" in vision_info or "video_url" in vision_info:
	if is_video(vision_info['video']):
	data_value = vision_info['video']
	else:
	data_value = [os.path.join(vision_info['video'], frame) for frame in os.listdir(vision_info['video'])]
	vision_info['video']=data_value
	video_inputs.append(fetch_video(vision_info))
	elif "audio" in vision_info or "audio_url" in vision_info:
	if isinstance(vision_info["audio"], (tuple, list)):
	audio_inputs.extend(fetch_audio(info) for info in vision_info["audio"])
	else:
	audio_inputs.append(fetch_audio(vision_info))
	else:
	raise ValueError("image, image_url, video, video_url, audio or audio_url should in content.")
	if len(image_inputs) == 0:
	image_inputs = None
	if len(video_inputs) == 0:
	video_inputs = None
	if len(audio_inputs) == 0:
	audio_inputs = None
	return image_inputs, video_inputs, audio_inputs