|
|
|
|
|
import argparse |
|
|
import binascii |
|
|
import logging |
|
|
import os |
|
|
import os.path as osp |
|
|
import shutil |
|
|
import subprocess |
|
|
|
|
|
import imageio |
|
|
import torch |
|
|
import torchvision |
|
|
|
|
|
__all__ = ['save_video', 'save_image', 'str2bool'] |
|
|
|
|
|
|
|
|
def rand_name(length=8, suffix=''):
    """Return a random hexadecimal file name.

    Args:
        length (int): Number of random bytes; the hex name has ``2 * length``
            characters.
        suffix (str): Optional extension; a leading dot is added if missing.

    Returns:
        str: Random hex string, with ``suffix`` appended when provided.
    """
    token = binascii.b2a_hex(os.urandom(length)).decode('utf-8')
    if not suffix:
        return token
    # Normalize the extension so callers may pass 'mp4' or '.mp4'.
    return token + (suffix if suffix.startswith('.') else '.' + suffix)
|
|
|
|
|
|
|
|
def merge_video_audio(video_path: str, audio_path: str):
    """
    Merge the video and audio into a new video, with the duration set to the
    shorter of the two, and overwrite the original video file in place.

    The video stream is copied as-is; the audio is re-encoded to AAC at
    192 kbps. On any failure after the existence checks, the temporary
    output is removed and the error is logged (not re-raised).

    Parameters:
        video_path (str): Path to the original video file
        audio_path (str): Path to the audio file

    Raises:
        FileNotFoundError: If either input file does not exist.
    """
    logging.basicConfig(level=logging.INFO)

    # Validate inputs up front so we never invoke ffmpeg on missing files.
    if not os.path.exists(video_path):
        raise FileNotFoundError(f"video file {video_path} does not exist")
    if not os.path.exists(audio_path):
        raise FileNotFoundError(f"audio file {audio_path} does not exist")

    # Write to a sibling temp file first, then atomically replace the input.
    base, ext = os.path.splitext(video_path)
    temp_output = f"{base}_temp{ext}"

    try:
        command = [
            'ffmpeg', '-y',
            '-i', video_path,
            '-i', audio_path,
            '-c:v', 'copy',      # keep the video stream untouched
            '-c:a', 'aac',       # re-encode audio to AAC
            '-b:a', '192k',
            '-map', '0:v:0',     # first video stream of input 0
            '-map', '1:a:0',     # first audio stream of input 1
            '-shortest',         # stop at the shorter of the two streams
            temp_output,
        ]

        logging.info("Start merging video and audio...")
        result = subprocess.run(
            command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)

        if result.returncode != 0:
            error_msg = f"FFmpeg execute failed: {result.stderr}"
            logging.error(error_msg)
            raise RuntimeError(error_msg)

        shutil.move(temp_output, video_path)
        logging.info(f"Merge completed, saved to {video_path}")

    except Exception as e:
        # Best-effort cleanup of the partial temp file; the failure is
        # logged rather than propagated (callers treat this as best-effort).
        if os.path.exists(temp_output):
            os.remove(temp_output)
        logging.error(f"merge_video_audio failed with error: {e}")
|
|
|
|
|
|
|
|
def save_video(tensor,
               save_file=None,
               fps=30,
               suffix='.mp4',
               nrow=8,
               normalize=True,
               value_range=(-1, 1)):
    """Encode a batched video tensor to a video file via imageio/libx264.

    Frames are taken by unbinding dim 2, so the expected layout is
    (batch, channel, frame, height, width) — TODO confirm against callers.
    Each frame's batch is tiled into a grid with torchvision's make_grid.
    Failures are logged and swallowed (no exception propagates).

    Args:
        tensor (torch.Tensor): Video tensor as described above.
        save_file (str | None): Output path; when None a random name under
            /tmp with ``suffix`` is used.
        fps (int): Frames per second of the encoded video.
        suffix (str): Extension for the auto-generated name (ignored when
            ``save_file`` is given).
        nrow (int): Grid width forwarded to make_grid.
        normalize (bool): Forwarded to make_grid.
        value_range (tuple): (min, max) used for clamping and normalization.
    """
    # Fall back to a random file under /tmp when no path is given.
    cache_file = osp.join('/tmp', rand_name(
        suffix=suffix)) if save_file is None else save_file

    writer = None
    try:
        # Clamp into range, tile each frame's batch into a grid, then
        # reorder to (frame, H, W, C) uint8 as imageio expects.
        tensor = tensor.clamp(min(value_range), max(value_range))
        tensor = torch.stack([
            torchvision.utils.make_grid(
                u, nrow=nrow, normalize=normalize, value_range=value_range)
            for u in tensor.unbind(2)
        ],
                             dim=1).permute(1, 2, 3, 0)
        tensor = (tensor * 255).type(torch.uint8).cpu()

        writer = imageio.get_writer(
            cache_file, fps=fps, codec='libx264', quality=8)
        for frame in tensor.numpy():
            writer.append_data(frame)
    except Exception as e:
        # Bug fix: failures were logged at info level; use error instead.
        logging.error(f'save_video failed, error: {e}')
    finally:
        # Bug fix: the writer was previously leaked when an exception was
        # raised mid-encode; always close it so the ffmpeg subprocess is
        # released and any partial output is flushed.
        if writer is not None:
            writer.close()
|
|
|
|
|
|
|
|
def save_image(tensor, save_file, nrow=8, normalize=True, value_range=(-1, 1)):
    """Save an image tensor to disk with torchvision.

    Args:
        tensor (torch.Tensor): Image tensor accepted by
            ``torchvision.utils.save_image`` (e.g. batch of CHW images).
        save_file (str): Target path; unsupported extensions are rewritten
            to ``.png``.
        nrow (int): Grid width forwarded to save_image.
        normalize (bool): Forwarded to save_image.
        value_range (tuple): (min, max) used for clamping and normalization.

    Returns:
        str | None: The path actually written, or None on failure (the
        error is logged and swallowed).
    """
    # Bug fix: the normalized suffix was previously computed but never
    # applied, so unsupported extensions were passed straight through to
    # torchvision (which infers the format from the extension and fails).
    # Now the output path itself falls back to .png.
    suffix = osp.splitext(save_file)[1]
    if suffix.lower() not in [
            '.jpg', '.jpeg', '.png', '.tiff', '.gif', '.webp'
    ]:
        save_file = osp.splitext(save_file)[0] + '.png'

    try:
        tensor = tensor.clamp(min(value_range), max(value_range))
        torchvision.utils.save_image(
            tensor,
            save_file,
            nrow=nrow,
            normalize=normalize,
            value_range=value_range)
        return save_file
    except Exception as e:
        # Bug fix: log failures at error level instead of info.
        logging.error(f'save_image failed, error: {e}')
|
|
|
|
|
|
|
|
def str2bool(v):
    """
    Convert a string to a boolean.

    Supported true values: 'yes', 'true', 't', 'y', '1'
    Supported false values: 'no', 'false', 'f', 'n', '0'

    Args:
        v (str): String to convert. Booleans are passed through unchanged.

    Returns:
        bool: Converted boolean value.

    Raises:
        argparse.ArgumentTypeError: If the value cannot be converted to boolean.
    """
    # Already a bool (argparse may hand us one): pass it through.
    if isinstance(v, bool):
        return v
    normalized = v.lower()
    truthy = ('yes', 'true', 't', 'y', '1')
    falsy = ('no', 'false', 'f', 'n', '0')
    if normalized in truthy:
        return True
    if normalized in falsy:
        return False
    raise argparse.ArgumentTypeError('Boolean value expected (True/False)')
|
|
|
|
|
|
|
|
def masks_like(tensor, zero=False, generator=None, p=0.2):
    """Build two lists of all-ones masks shaped like the input tensors.

    For each tensor ``u`` in the input list, two ones-tensors of the same
    shape/dtype/device are created. When ``zero`` is True the slice at
    index 0 along dim 1 (presumably the first frame/timestep — confirm
    with callers) is modified:

    - without a ``generator``: that slice is zeroed in both outputs;
    - with a ``generator``: independently per tensor with probability
      ``p``, the first output's slice is filled with a single sampled
      value ``exp(N(-3.5, 0.5))`` broadcast over the slice, and the second
      output's slice is zeroed; otherwise both slices stay all-ones.

    Args:
        tensor (list[torch.Tensor]): Input tensors (must be a list).
        zero (bool): Whether to modify the first dim-1 slice.
        generator (torch.Generator | None): RNG for the stochastic path.
        p (float): Probability of applying the stochastic modification.

    Returns:
        tuple[list[torch.Tensor], list[torch.Tensor]]: The two mask lists.
    """
    assert isinstance(tensor, list)
    out1 = [torch.ones(u.shape, dtype=u.dtype, device=u.device) for u in tensor]
    out2 = [torch.ones(u.shape, dtype=u.dtype, device=u.device) for u in tensor]

    if zero:
        if generator is None:
            # Deterministic path: blank the first slice of both masks.
            for u, v in zip(out1, out2):
                u[:, 0] = torch.zeros_like(u[:, 0])
                v[:, 0] = torch.zeros_like(v[:, 0])
        else:
            for u, v in zip(out1, out2):
                random_num = torch.rand(
                    1, generator=generator, device=generator.device).item()
                if random_num < p:
                    # Broadcast one log-normal sample over the whole slice.
                    u[:, 0] = torch.normal(
                        mean=-3.5,
                        std=0.5,
                        size=(1,),
                        device=u.device,
                        generator=generator).expand_as(u[:, 0]).exp()
                    v[:, 0] = torch.zeros_like(v[:, 0])
                # Bug fix: the original else-branch self-assigned
                # u[:, 0] = u[:, 0] (a no-op); removed as dead code.

    return out1, out2
|
|
|
|
|
|
|
|
def best_output_size(w, h, dw, dh, expected_area):
    """Pick an output size aligned to (dw, dh) close to a target aspect ratio.

    Two candidates are built — one by flooring the ideal width to a multiple
    of ``dw``, one by flooring the ideal height to a multiple of ``dh`` —
    each with area at most ``expected_area``. The candidate whose aspect
    ratio deviates least (multiplicatively) from ``w / h`` wins.

    Args:
        w, h: Source dimensions defining the target aspect ratio.
        dw, dh: Required alignment of the output width and height.
        expected_area: Upper bound on the output pixel area.

    Returns:
        tuple[int, int]: (output_width, output_height).
    """
    target_ratio = w / h
    # Ideal (unaligned) dimensions matching the ratio at the full area.
    ideal_w = (expected_area * target_ratio)**0.5
    ideal_h = expected_area / ideal_w

    # Candidate 1: align the width first, derive the height from the area.
    cand_w1 = int(ideal_w // dw * dw)
    cand_h1 = int(expected_area / cand_w1 // dh * dh)
    assert cand_w1 % dw == 0 and cand_h1 % dh == 0 and cand_w1 * cand_h1 <= expected_area
    ratio1 = cand_w1 / cand_h1

    # Candidate 2: align the height first, derive the width from the area.
    cand_h2 = int(ideal_h // dh * dh)
    cand_w2 = int(expected_area / cand_h2 // dw * dw)
    assert cand_h2 % dh == 0 and cand_w2 % dw == 0 and cand_w2 * cand_h2 <= expected_area
    ratio2 = cand_w2 / cand_h2

    # Multiplicative deviation from the target ratio (symmetric in >1 / <1).
    error1 = max(target_ratio / ratio1, ratio1 / target_ratio)
    error2 = max(target_ratio / ratio2, ratio2 / target_ratio)
    return (cand_w1, cand_h1) if error1 < error2 else (cand_w2, cand_h2)
|
|
|
|
|
|
|
|
def download_cosyvoice_repo(repo_path):
    """Clone the CosyVoice repository (main branch, with submodules).

    Args:
        repo_path: Destination directory for the clone.

    Raises:
        ImportError: If GitPython is not installed.
    """
    try:
        import git
    except ImportError:
        raise ImportError('failed to import git, please run pip install GitPython')
    # --recursive pulls the repo's submodules along with the main tree.
    git.Repo.clone_from(
        'https://github.com/FunAudioLLM/CosyVoice.git',
        repo_path,
        multi_options=['--recursive'],
        branch='main')
|
|
|
|
|
|
|
|
def download_cosyvoice_model(model_name, model_path):
    """Download a CosyVoice model snapshot from ModelScope.

    Args:
        model_name: Model name under the ModelScope 'iic' organization.
        model_path: Local directory to download the snapshot into.
    """
    from modelscope import snapshot_download

    snapshot_download(f'iic/{model_name}', local_dir=model_path)
|
|
|