Bundle diffsynth library (no external repo dependency)

bc8c4af verified 11 days ago

10.7 kB

	import math
	import torch, torchvision, imageio, os
	import imageio.v3 as iio
	from PIL import Image
	import torchaudio


	class DataProcessingPipeline:
	    """Ordered chain of callables applied to a value one after another.

	    Pipelines compose with ``>>``: the right-hand side may be a single
	    ``DataProcessingOperator`` or another pipeline.
	    """

	    def __init__(self, operators=None):
	        # A fresh list is created only when nothing is supplied; a supplied
	        # list is stored as-is (not copied).
	        if operators is None:
	            operators = []
	        self.operators: list[DataProcessingOperator] = operators

	    def __call__(self, data):
	        """Feed ``data`` through every operator in order and return the result."""
	        result = data
	        for step in self.operators:
	            result = step(result)
	        return result

	    def __rshift__(self, pipe):
	        """Return a new pipeline: this pipeline's operators followed by ``pipe``'s."""
	        # A bare operator contributes itself; a pipeline contributes its list.
	        tail = [pipe] if isinstance(pipe, DataProcessingOperator) else pipe.operators
	        return DataProcessingPipeline(self.operators + tail)


	class DataProcessingOperator:
	    """Base class for a single processing step.

	    Subclasses override ``__call__``; ``>>`` chains a step with another
	    operator or pipeline into a ``DataProcessingPipeline``.
	    """

	    def __call__(self, data):
	        """Always raises: the base class has no behaviour of its own."""
	        message = "DataProcessingOperator cannot be called directly."
	        raise NotImplementedError(message)

	    def __rshift__(self, pipe):
	        """Return a pipeline that runs this operator first, then ``pipe``."""
	        head = DataProcessingPipeline([self])
	        if isinstance(pipe, DataProcessingOperator):
	            return head.__rshift__(DataProcessingPipeline([pipe]))
	        return head.__rshift__(pipe)


	class DataProcessingOperatorRaw(DataProcessingOperator):
	    """Pass-through operator: hands its input back unchanged."""

	    def __call__(self, data):
	        untouched = data
	        return untouched


	class ToInt(DataProcessingOperator):
	    """Operator that converts its input with the built-in ``int``."""

	    def __call__(self, data):
	        as_int = int(data)
	        return as_int


	class ToFloat(DataProcessingOperator):
	    """Operator that converts its input with the built-in ``float``."""

	    def __call__(self, data):
	        as_float = float(data)
	        return as_float


	class ToStr(DataProcessingOperator):
	    """Operator that stringifies its input, substituting a placeholder for ``None``.

	    Args:
	        none_value: Value used in place of ``None`` before ``str`` is applied.
	    """

	    def __init__(self, none_value=""):
	        self.none_value = none_value

	    def __call__(self, data):
	        value = self.none_value if data is None else data
	        return str(value)


	class LoadImage(DataProcessingOperator):
	    """Operator that opens an image with PIL and optionally converts its mode.

	    Args:
	        convert_RGB: Convert the opened image to ``"RGB"``.
	        convert_RGBA: Convert the image to ``"RGBA"``. Applied after the RGB
	            conversion when both flags are set.
	    """

	    def __init__(self, convert_RGB=True, convert_RGBA=False):
	        self.convert_RGB = convert_RGB
	        self.convert_RGBA = convert_RGBA

	    def __call__(self, data: str):
	        picture = Image.open(data)
	        # Conversions run in a fixed order: RGB first, then RGBA.
	        requested = ((self.convert_RGB, "RGB"), (self.convert_RGBA, "RGBA"))
	        for enabled, mode in requested:
	            if enabled:
	                picture = picture.convert(mode)
	        return picture


	class ImageCropAndResize(DataProcessingOperator):
	    """Resize an image so it covers a target size, then center-crop to that size.

	    Args:
	        height: Fixed output height, or ``None`` to derive it from the image.
	        width: Fixed output width, or ``None`` to derive it from the image.
	        max_pixels: Upper bound on ``height * width`` when the size is derived
	            from the image. ``None`` disables the bound.
	        height_division_factor: Derived heights are rounded down to a multiple
	            of this value.
	        width_division_factor: Derived widths are rounded down to a multiple
	            of this value.
	    """

	    def __init__(self, height=None, width=None, max_pixels=None, height_division_factor=1, width_division_factor=1):
	        self.height = height
	        self.width = width
	        self.max_pixels = max_pixels
	        self.height_division_factor = height_division_factor
	        self.width_division_factor = width_division_factor

	    def crop_and_resize(self, image, target_height, target_width):
	        """Return ``image`` resized and center-cropped to the target size."""
	        width, height = image.size
	        # Use the larger of the two ratios so the resized image is at least
	        # as big as the target along both axes before cropping.
	        scale = max(target_width / width, target_height / height)
	        image = torchvision.transforms.functional.resize(
	            image,
	            (round(height * scale), round(width * scale)),
	            interpolation=torchvision.transforms.InterpolationMode.BILINEAR,
	        )
	        image = torchvision.transforms.functional.center_crop(image, (target_height, target_width))
	        return image

	    def get_height_width(self, image):
	        """Return the ``(height, width)`` that ``image`` should be brought to.

	        With both ``height`` and ``width`` configured they are returned as-is.
	        Otherwise the image's own size is used, shrunk to respect
	        ``max_pixels`` and rounded down to the division factors.
	        """
	        if self.height is None or self.width is None:
	            width, height = image.size
	            # A missing max_pixels means "no area limit" rather than a
	            # TypeError on the comparison.
	            if self.max_pixels is not None and width * height > self.max_pixels:
	                # Shrink both sides by the same factor so the area fits.
	                scale = (width * height / self.max_pixels) ** 0.5
	                height, width = int(height / scale), int(width / scale)
	            height = height // self.height_division_factor * self.height_division_factor
	            width = width // self.width_division_factor * self.width_division_factor
	        else:
	            height, width = self.height, self.width
	        return height, width

	    def __call__(self, data: Image.Image):
	        target_height, target_width = self.get_height_width(data)
	        return self.crop_and_resize(data, target_height, target_width)


	class ToList(DataProcessingOperator):
	    """Operator that wraps its input in a one-element list."""

	    def __call__(self, data):
	        wrapped = [data]
	        return wrapped


	class FrameSamplerByRateMixin:
	    """Frame-count and frame-index helpers shared by media loaders.

	    Args:
	        num_frames: Maximum number of frames to sample.
	        time_division_factor: Modulus the sampled frame count must satisfy.
	        time_division_remainder: Required value of
	            ``frame_count % time_division_factor``.
	        frame_rate: Target frame rate used when ``fix_frame_rate`` is set.
	        fix_frame_rate: When true, frames are counted and indexed on a
	            ``frame_rate`` timeline instead of the source's own frames.
	    """

	    def __init__(self, num_frames=81, time_division_factor=4, time_division_remainder=1, frame_rate=24, fix_frame_rate=False):
	        self.num_frames = num_frames
	        self.time_division_factor = time_division_factor
	        self.time_division_remainder = time_division_remainder
	        self.frame_rate = frame_rate
	        self.fix_frame_rate = fix_frame_rate

	    def get_reader(self, data: str):
	        """Open ``data`` with ``imageio.get_reader``."""
	        return imageio.get_reader(data)

	    def get_available_num_frames(self, reader):
	        """Return how many frames the source offers.

	        Without ``fix_frame_rate`` this is the reader's own frame count.
	        With it, the count is ``floor(duration * frame_rate)``, where the
	        duration comes from the metadata or, failing that, from the raw
	        frame count divided by the metadata ``fps``.
	        """
	        if self.fix_frame_rate:
	            metadata = reader.get_meta_data()
	            raw_count = int(reader.count_frames())
	            if "duration" in metadata:
	                seconds = metadata["duration"]
	            else:
	                seconds = raw_count / metadata['fps']
	            return int(math.floor(seconds * self.frame_rate))
	        return reader.count_frames()

	    def get_num_frames(self, reader):
	        """Return the frame count to sample, capped by availability.

	        The count is then decreased until it is 1 or satisfies
	        ``count % time_division_factor == time_division_remainder``.
	        """
	        wanted = self.num_frames
	        available = self.get_available_num_frames(reader)
	        count = available if int(available) < wanted else wanted
	        factor, remainder = self.time_division_factor, self.time_division_remainder
	        while count > 1 and count % factor != remainder:
	            count -= 1
	        return count

	    def map_single_frame_id(self, new_sequence_id: int, raw_frame_rate: float, total_raw_frames: int) -> int:
	        """Translate an output frame index into a source frame index.

	        Without ``fix_frame_rate`` the index is returned unchanged. With it,
	        the index is converted to a timestamp at ``frame_rate``, mapped to
	        the nearest source frame, and capped at the last source frame.
	        """
	        if self.fix_frame_rate:
	            seconds = new_sequence_id / self.frame_rate
	            nearest = int(round(seconds * raw_frame_rate))
	            return min(nearest, total_raw_frames - 1)
	        return new_sequence_id


	class LoadVideo(DataProcessingOperator, FrameSamplerByRateMixin):
	    """Operator that decodes a video into a fixed-length list of processed frames.

	    Exactly ``num_frames`` entries are returned; a video with fewer available
	    frames is padded by repeating its last processed frame.

	    Args:
	        num_frames: Number of frames to return.
	        time_division_factor: Forwarded to ``FrameSamplerByRateMixin``.
	        time_division_remainder: Forwarded to ``FrameSamplerByRateMixin``.
	        frame_processor: Callable applied to each decoded PIL frame.
	        frame_rate: Forwarded to ``FrameSamplerByRateMixin``.
	        fix_frame_rate: Forwarded to ``FrameSamplerByRateMixin``.
	    """

	    def __init__(self, num_frames=81, time_division_factor=4, time_division_remainder=1, frame_processor=lambda x: x, frame_rate=24, fix_frame_rate=False):
	        FrameSamplerByRateMixin.__init__(self, num_frames, time_division_factor, time_division_remainder, frame_rate, fix_frame_rate)
	        # frame_processor is built into the video loader for high efficiency.
	        self.frame_processor = frame_processor

	    def __call__(self, data: str):
	        """Return ``num_frames`` processed frames read from the video at ``data``.

	        Raises:
	            IndexError: If padding is needed but no frame was decoded, since
	                there is no last frame to repeat.
	        """
	        reader = self.get_reader(data)
	        # Close the reader even when decoding or frame_processor fails.
	        try:
	            raw_frame_rate = reader.get_meta_data()['fps']
	            total_raw_frames = reader.count_frames()
	            total_available = self.get_available_num_frames(reader)
	            # Pad short videos with the last frame instead of reducing num_frames
	            num_frames = self.num_frames
	            frames = []
	            for frame_id in range(num_frames):
	                if frame_id < total_available:
	                    raw_id = self.map_single_frame_id(frame_id, raw_frame_rate, total_raw_frames)
	                    frame = reader.get_data(raw_id)
	                    frame = Image.fromarray(frame)
	                    frame = self.frame_processor(frame)
	                    frames.append(frame)
	                else:
	                    # Pad with the last frame (the same object, not a copy).
	                    frames.append(frames[-1])
	        finally:
	            reader.close()
	        return frames


	class SequencialProcess(DataProcessingOperator):
	    """Operator that applies another callable to each element of an iterable.

	    Args:
	        operator: Callable applied to every element; defaults to identity.
	    """

	    def __init__(self, operator=lambda x: x):
	        self.operator = operator

	    def __call__(self, data):
	        results = []
	        for item in data:
	            results.append(self.operator(item))
	        return results


	class LoadGIF(DataProcessingOperator):
	    """Operator that decodes a GIF into a list of processed frames.

	    Args:
	        num_frames: Maximum number of frames to return.
	        time_division_factor: Modulus the returned frame count must satisfy.
	        time_division_remainder: Required value of
	            ``frame_count % time_division_factor``.
	        frame_processor: Callable applied to each decoded PIL frame.
	    """

	    def __init__(self, num_frames=81, time_division_factor=4, time_division_remainder=1, frame_processor=lambda x: x):
	        self.num_frames = num_frames
	        self.time_division_factor = time_division_factor
	        self.time_division_remainder = time_division_remainder
	        # frame_processor is built into the loader for high efficiency.
	        self.frame_processor = frame_processor

	    def _limit_num_frames(self, available):
	        """Cap ``num_frames`` at ``available`` and fit the time-division rule."""
	        num_frames = self.num_frames
	        if available < num_frames:
	            num_frames = available
	        # Decrease until the count is 1 or has the required remainder.
	        while num_frames > 1 and num_frames % self.time_division_factor != self.time_division_remainder:
	            num_frames -= 1
	        return num_frames

	    def get_num_frames(self, path):
	        """Return how many frames would be taken from the GIF at ``path``."""
	        images = iio.imread(path, mode="RGB")
	        return self._limit_num_frames(len(images))

	    def __call__(self, data: str):
	        """Return the processed leading frames of the GIF at ``data``."""
	        # Decode the file once and reuse it for both the frame count and the
	        # frames; calling get_num_frames here would decode it a second time.
	        images = iio.imread(data, mode="RGB")
	        num_frames = self._limit_num_frames(len(images))
	        frames = []
	        for img in images:
	            frame = Image.fromarray(img)
	            frame = self.frame_processor(frame)
	            frames.append(frame)
	            if len(frames) >= num_frames:
	                break
	        return frames


	class RouteByExtensionName(DataProcessingOperator):
	    """Operator that dispatches a path to a handler chosen by file extension.

	    Args:
	        operator_map: Iterable of ``(ext_names, operator)`` pairs, tried in
	            order. ``ext_names`` of ``None`` matches every path.
	    """

	    def __init__(self, operator_map):
	        self.operator_map = operator_map

	    def __call__(self, data: str):
	        # Text after the last "." (the whole string when there is none),
	        # lower-cased for comparison.
	        suffix = data.rpartition(".")[2].lower()
	        for accepted, handler in self.operator_map:
	            is_match = accepted is None or suffix in accepted
	            if is_match:
	                return handler(data)
	        raise ValueError(f"Unsupported file: {data}")


	class RouteByType(DataProcessingOperator):
	    """Operator that dispatches a value to a handler chosen by its type.

	    Args:
	        operator_map: Iterable of ``(dtype, operator)`` pairs, tried in
	            order. ``dtype`` of ``None`` matches every value.
	    """

	    def __init__(self, operator_map):
	        self.operator_map = operator_map

	    def __call__(self, data):
	        for expected_type, handler in self.operator_map:
	            if expected_type is not None and not isinstance(data, expected_type):
	                continue
	            return handler(data)
	        raise ValueError(f"Unsupported data: {data}")


	class LoadTorchPickle(DataProcessingOperator):
	    """Operator that loads a file with ``torch.load``.

	    Args:
	        map_location: Forwarded to ``torch.load``.

	    NOTE(security): the file is loaded with ``weights_only=False``, i.e.
	    full unpickling. Only use this on files from a trusted source.
	    """

	    def __init__(self, map_location="cpu"):
	        self.map_location = map_location

	    def __call__(self, data):
	        load_options = {"map_location": self.map_location, "weights_only": False}
	        return torch.load(data, **load_options)


	class ToAbsolutePath(DataProcessingOperator):
	    """Operator that joins a configured base path onto its input.

	    Args:
	        base_path: Path passed as the first argument to ``os.path.join``.
	    """

	    def __init__(self, base_path=""):
	        self.base_path = base_path

	    def __call__(self, data):
	        root = self.base_path
	        return os.path.join(root, data)


	class LoadAudio(DataProcessingOperator):
	    """Operator that loads an audio file with librosa and returns its samples.

	    Args:
	        sr: Sample rate passed to ``librosa.load``.
	    """

	    def __init__(self, sr=16000):
	        self.sr = sr

	    def __call__(self, data: str):
	        # Imported at call time rather than at module level; presumably so
	        # librosa is only required when audio is loaded — TODO confirm.
	        import librosa
	        samples, _ = librosa.load(data, sr=self.sr)
	        return samples


	class LoadAudioWithTorchaudio(DataProcessingOperator, FrameSamplerByRateMixin):
	    """Operator that loads audio trimmed or zero-padded to a frame-derived length.

	    The target length is the duration of the frame count chosen by
	    ``get_num_frames`` at ``frame_rate``.

	    Args:
	        num_frames: Forwarded to ``FrameSamplerByRateMixin``.
	        time_division_factor: Forwarded to ``FrameSamplerByRateMixin``.
	        time_division_remainder: Forwarded to ``FrameSamplerByRateMixin``.
	        frame_rate: Forwarded to ``FrameSamplerByRateMixin``.
	        fix_frame_rate: Forwarded to ``FrameSamplerByRateMixin``.
	    """

	    def __init__(self, num_frames=121, time_division_factor=8, time_division_remainder=1, frame_rate=24, fix_frame_rate=True):
	        FrameSamplerByRateMixin.__init__(self, num_frames, time_division_factor, time_division_remainder, frame_rate, fix_frame_rate)

	    def __call__(self, data: str):
	        """Return ``(waveform, sample_rate)`` for the media file at ``data``."""
	        reader = self.get_reader(data)
	        # The reader is only needed to pick the frame count; close it right
	        # away, including on error, so it is not leaked.
	        try:
	            num_frames = self.get_num_frames(reader)
	        finally:
	            reader.close()
	        duration = num_frames / self.frame_rate
	        waveform, sample_rate = torchaudio.load(data)
	        target_samples = int(duration * sample_rate)
	        current_samples = waveform.shape[-1]
	        if current_samples > target_samples:
	            # Too long: keep only the leading samples along the last axis.
	            waveform = waveform[..., :target_samples]
	        elif current_samples < target_samples:
	            # Too short: zero-pad the end of the last axis.
	            padding = target_samples - current_samples
	            waveform = torch.nn.functional.pad(waveform, (0, padding))
	        return waveform, sample_rate