# NOTE(review): removed scrape artifacts ("Spaces:" / "Runtime error" x2) that
# preceded the module and were not part of the Python source.
| from pathlib import Path | |
| from PIL import Image | |
| try: | |
| from tqdm import trange | |
| except: | |
| from builtins import range as trange | |
| import torch, gc | |
| import cv2 | |
| import os.path | |
| import numpy as np | |
| import copy | |
| import platform | |
| import math | |
| # Our code | |
| from src.misc import * | |
| from src.common_constants import GenerationOptions as go | |
| from src.common_constants import * | |
| from src.stereoimage_generation import create_stereoimages | |
| from src.normalmap_generation import create_normalmap | |
| from src.depthmap_generation import ModelHolder | |
| from src import backbone | |
| try: | |
| # 3d-photo-inpainting imports | |
| from inpaint.mesh import write_mesh, read_mesh, output_3d_photo | |
| from inpaint.networks import Inpaint_Color_Net, Inpaint_Depth_Net, Inpaint_Edge_Net | |
| from inpaint.utils import path_planning | |
| from inpaint.bilateral_filtering import sparse_bilateral_filtering | |
| except Exception as e: | |
| print('Impaint import failed. Impaint will not work.') | |
| import traceback | |
| traceback.print_exc() | |
# Module-level cache of the most recently loaded inpainting mesh, so repeated
# video generation from the same mesh file does not re-read it from disk.
# NOTE(review): `global` at module level is a no-op statement; kept as-is.
global video_mesh_data, video_mesh_fn
video_mesh_data = None  # parsed mesh tuple, populated by run_3dphoto_videos
video_mesh_fn = None  # filename the cached mesh was read from

# Singleton that owns loading/offloading of the depth-estimation models.
model_holder = ModelHolder()
def convert_to_i16(arr):
    """Convert a float depthmap scaled to [0; 1) into a single-channel
    16-bit image array.

    Some precision is inevitably lost. The uint16 conversion rounds down,
    therefore values are clipped to stay inside [0; 2**16).
    """
    bits = 16
    top = 2 ** bits
    # +0.0001 nudges values upward before the round-down cast; clipping to
    # top - 0.1 leaves headroom so the cast cannot overflow past 65535.
    clipped = np.clip(arr * top + 0.0001, 0, top - 0.1)
    return clipped.astype("uint16")
def convert_i16_to_rgb(image, like):
    """Broadcast a single-channel 16-bit depthmap into a three-channel,
    8-bits-per-channel array shaped (and typed) like `like`."""
    output = np.zeros_like(like)
    # Dropping the low byte maps the 16-bit range onto 8 bits; the assignment
    # into `output` casts to its dtype.
    scaled = image / 256.0
    for channel in (0, 1, 2):
        output[:, :, channel] = scaled
    return output
class CoreGenerationFunnelInp:
    """This class takes a dictionary and creates a core_generation_funnel inp.
    Non-applicable parameters are silently discarded (no error)"""

    def __init__(self, values):
        # Accept another funnel-input object and unwrap its raw dict.
        if isinstance(values, CoreGenerationFunnelInp):
            values = values.values
        # Normalize keys: GenerationOptions members become their lowercase names,
        # plain string keys are lowercased.
        normalized = {}
        for key, value in values.items():
            if isinstance(key, GenerationOptions):
                key = key.name
            normalized[key.lower()] = value
        # Keep only known settings; anything missing falls back to its default.
        self.values = {}
        for setting in GenerationOptions:
            name = setting.name.lower()
            self.values[name] = normalized.get(name, setting.df)

    def __getitem__(self, item):
        # Both GenerationOptions members and plain strings work as keys.
        key = item.name.lower() if isinstance(item, GenerationOptions) else item
        return self.values[key]

    def __getattr__(self, item):
        # Attribute access falls through to item access: inp.boost == inp['boost'].
        return self[item]
def core_generation_funnel(outpath, inputimages, inputdepthmaps, inputnames, inp, ops=None):
    """Generator producing every requested depthmap-derived output.

    For each input image a depthmap is either computed by a model or taken
    from `inputdepthmaps`, then the enabled outputs are yielded as tuples
    (input_index, result_type, result) — e.g. 'depth', 'concat_depth',
    'heatmap', 'normalmap', a stereo mode name, 'simple_mesh',
    'inpainted_mesh'.

    Args:
        outpath: directory where meshes are written (see TODO below).
        inputimages: list of PIL images.
        inputdepthmaps: optional list of custom depthmaps (PIL images or
            arrays in [0; 1]); None entries are predicted by the model.
        inputnames: original filenames, used to name inpainted meshes.
        inp: settings, anything accepted by CoreGenerationFunnelInp.
        ops: backbone options dict; gathered from the backbone when None.
    """
    if len(inputimages) == 0 or inputimages[0] is None:
        return
    if inputdepthmaps is None or len(inputdepthmaps) == 0:
        inputdepthmaps: list[Image] = [None for _ in range(len(inputimages))]
    # True when every image already has a user-supplied depthmap — in that
    # case no depth model needs to be loaded at all.
    inputdepthmaps_complete = all([x is not None for x in inputdepthmaps])

    inp = CoreGenerationFunnelInp(inp)

    if ops is None:
        ops = backbone.gather_ops()
    model_holder.update_settings(**ops)

    # TODO: ideally, run_depthmap should not save meshes - that makes the function not pure
    print(SCRIPT_FULL_NAME)

    print(f'Backbone: {backbone.USED_BACKBONE.name}')
    backbone.unload_sd_model()

    # TODO: this still should not be here
    background_removed_images = []
    # remove on base image before depth calculation
    if inp[go.GEN_REMBG]:
        if inp[go.PRE_DEPTH_BACKGROUND_REMOVAL]:
            # Depth will be computed on the background-removed images.
            inputimages = batched_background_removal(inputimages, inp[go.REMBG_MODEL])
            background_removed_images = inputimages
        else:
            background_removed_images = batched_background_removal(inputimages, inp[go.REMBG_MODEL])

    # init torch device
    if inp[go.COMPUTE_DEVICE] == 'GPU':
        if torch.cuda.is_available():
            device = torch.device("cuda")
        else:
            print('WARNING: Cuda device was not found, cpu will be used')
            device = torch.device("cpu")
    else:
        device = torch.device("cpu")
    print("device: %s" % device)

    # TODO: This should not be here
    # Images/depths accumulated for the 3d-photo-inpainting second pass.
    inpaint_imgs = []
    inpaint_depths = []

    try:
        if not inputdepthmaps_complete:
            print("Loading model(s) ..")
            model_holder.ensure_models(inp[go.MODEL_TYPE], device, inp[go.BOOST], inp[go.TILING_MODE])
        print("Computing output(s) ..")
        # iterate over input images
        for count in trange(0, len(inputimages)):
            # Convert single channel input (PIL) images to rgb
            if inputimages[count].mode == 'I':
                # NOTE(review): the result of .point() is discarded here, which
                # makes this line look like a no-op — kept as-is; verify intent.
                inputimages[count].point(lambda p: p * 0.0039063096, mode='RGB')
                inputimages[count] = inputimages[count].convert('RGB')

            raw_prediction = None
            """Raw prediction, as returned by a model. None if input depthmap is used."""
            raw_prediction_invert = False
            """True if near=dark on raw_prediction"""
            out = None  # depthmap as float array, scaled to [0; 1]

            if inputdepthmaps is not None and inputdepthmaps[count] is not None:
                # use custom depthmap
                dp = inputdepthmaps[count]
                if isinstance(dp, Image.Image):
                    if dp.width != inputimages[count].width or dp.height != inputimages[count].height:
                        try:  # LANCZOS may fail on some formats
                            dp = dp.resize((inputimages[count].width, inputimages[count].height), Image.Resampling.LANCZOS)
                        except:
                            dp = dp.resize((inputimages[count].width, inputimages[count].height))
                    # Trying desperately to rescale image to [0;1) without actually normalizing it
                    # Normalizing is avoided, because we want to preserve the scale of the original depthmaps
                    # (batch mode, video mode).
                    if len(dp.getbands()) == 1:
                        out = np.asarray(dp, dtype="float")
                        out_max = out.max()
                        # Guess the bit depth from the largest pixel value.
                        if out_max < 256:
                            bit_depth = 8
                        elif out_max < 65536:
                            bit_depth = 16
                        else:
                            bit_depth = 32
                        out /= 2.0 ** bit_depth
                    else:
                        # Multi-channel image: use the first channel only.
                        out = np.asarray(dp, dtype="float")[:, :, 0]
                        out /= 256.0
                else:
                    # Should be in interval [0; 1], values outside of this range will be clipped.
                    out = np.asarray(dp, dtype="float")
                    assert inputimages[count].height == out.shape[0], "Custom depthmap height mismatch"
                    assert inputimages[count].width == out.shape[1], "Custom depthmap width mismatch"
            else:
                # override net size (size may be different for different images)
                if inp[go.NET_SIZE_MATCH]:
                    # Round up to a multiple of 32 to avoid potential issues
                    # TODO: buggs for Depth Anything
                    net_width = (inputimages[count].width + 31) // 32 * 32
                    net_height = (inputimages[count].height + 31) // 32 * 32
                else:
                    net_width = inp[go.NET_WIDTH]
                    net_height = inp[go.NET_HEIGHT]
                raw_prediction, raw_prediction_invert = \
                    model_holder.get_raw_prediction(inputimages[count], net_width, net_height)

                # output
                # A (near-)constant prediction is treated as broken (see else).
                if abs(raw_prediction.max() - raw_prediction.min()) > np.finfo("float").eps:
                    out = np.copy(raw_prediction)
                    # TODO: some models may output negative values, maybe these should be clamped to zero.
                    if raw_prediction_invert:
                        out *= -1
                    if inp[go.DO_OUTPUT_DEPTH_PREDICTION]:
                        yield count, 'depth_prediction', np.copy(out)
                    if inp[go.CLIPDEPTH]:
                        if inp[go.CLIPDEPTH_MODE] == 'Range':
                            out = (out - out.min()) / (out.max() - out.min())  # normalize to [0; 1]
                            out = np.clip(out, inp[go.CLIPDEPTH_FAR], inp[go.CLIPDEPTH_NEAR])
                        elif inp[go.CLIPDEPTH_MODE] == 'Outliers':
                            # Clip to the far/near percentiles instead of fixed bounds.
                            fb, nb = np.percentile(out, [inp[go.CLIPDEPTH_FAR] * 100.0, inp[go.CLIPDEPTH_NEAR] * 100.0])
                            out = np.clip(out, fb, nb)
                    out = (out - out.min()) / (out.max() - out.min())  # normalize to [0; 1]
                else:
                    # Regretfully, the depthmap is broken and will be replaced with a black image
                    out = np.zeros(raw_prediction.shape)

            # Maybe we should not use img_output for everything, since we get better accuracy from
            # the raw_prediction. However, it is not always supported. We maybe would like to achieve
            # reproducibility, so depthmap of the image should be the same as generating the depthmap one more time.
            img_output = convert_to_i16(out)
            """Depthmap (near=bright), as uint16"""

            # if 3dinpainting, store maps for processing in second pass
            if inp[go.GEN_INPAINTED_MESH]:
                inpaint_imgs.append(inputimages[count])
                inpaint_depths.append(img_output)

            # applying background masks after depth
            if inp[go.GEN_REMBG]:
                print('applying background masks')
                background_removed_image = background_removed_images[count]
                # maybe a threshold cut would be better on the line below.
                background_removed_array = np.array(background_removed_image)
                # A pixel counts as background when RGB is all-zero and alpha is ~0.
                bg_mask = (background_removed_array[:, :, 0] == 0) & (background_removed_array[:, :, 1] == 0) & (
                        background_removed_array[:, :, 2] == 0) & (background_removed_array[:, :, 3] <= 0.2)
                img_output[bg_mask] = 0  # far value

                yield count, 'background_removed', background_removed_image

                if inp[go.SAVE_BACKGROUND_REMOVAL_MASKS]:
                    bg_array = (1 - bg_mask.astype('int8')) * 255
                    mask_array = np.stack((bg_array, bg_array, bg_array, bg_array), axis=2)
                    mask_image = Image.fromarray(mask_array.astype(np.uint8))

                    yield count, 'foreground_mask', mask_image

            # A weird quirk: if user tries to save depthmap, whereas custom depthmap is used,
            # custom depthmap will be outputed
            if inp[go.DO_OUTPUT_DEPTH]:
                img_depth = cv2.bitwise_not(img_output) if inp[go.OUTPUT_DEPTH_INVERT] else img_output
                if inp[go.OUTPUT_DEPTH_COMBINE]:
                    # Side-by-side (or stacked) view of the input and its depthmap.
                    axis = 1 if inp[go.OUTPUT_DEPTH_COMBINE_AXIS] == 'Horizontal' else 0
                    img_concat = Image.fromarray(np.concatenate(
                        (inputimages[count], convert_i16_to_rgb(img_depth, inputimages[count])),
                        axis=axis))
                    yield count, 'concat_depth', img_concat
                else:
                    yield count, 'depth', Image.fromarray(img_depth)

            if inp[go.GEN_STEREO]:
                # print("Generating stereoscopic image(s)..")
                stereoimages = create_stereoimages(
                    inputimages[count], img_output,
                    inp[go.STEREO_DIVERGENCE], inp[go.STEREO_SEPARATION],
                    inp[go.STEREO_MODES],
                    inp[go.STEREO_BALANCE], inp[go.STEREO_OFFSET_EXPONENT], inp[go.STEREO_FILL_ALGO])
                for c in range(0, len(stereoimages)):
                    yield count, inp[go.STEREO_MODES][c], stereoimages[c]

            if inp[go.GEN_NORMALMAP]:
                normalmap = create_normalmap(
                    img_output,
                    inp[go.NORMALMAP_PRE_BLUR_KERNEL] if inp[go.NORMALMAP_PRE_BLUR] else None,
                    inp[go.NORMALMAP_SOBEL_KERNEL] if inp[go.NORMALMAP_SOBEL] else None,
                    inp[go.NORMALMAP_POST_BLUR_KERNEL] if inp[go.NORMALMAP_POST_BLUR] else None,
                    inp[go.NORMALMAP_INVERT]
                )
                yield count, 'normalmap', normalmap

            if inp[go.GEN_HEATMAP]:
                from dzoedepth.utils.misc import colorize
                heatmap = Image.fromarray(colorize(img_output, cmap='inferno'))
                yield count, 'heatmap', heatmap

            # gen mesh
            if inp[go.GEN_SIMPLE_MESH]:
                print(f"\nGenerating (occluded) mesh ..")
                basename = 'depthmap'
                meshsimple_fi = get_uniquefn(outpath, basename, 'obj', 'simple')

                # Prefer the raw model prediction for accuracy; fall back to
                # the processed depthmap when a custom depthmap was used.
                depthi = raw_prediction if raw_prediction is not None else out
                depthi_min, depthi_max = depthi.min(), depthi.max()
                # try to map output to sensible values for non zoedepth models, boost, or custom maps
                if inp[go.MODEL_TYPE] not in [7, 8, 9] or inp[go.BOOST] or inputdepthmaps[count] is not None:
                    # invert if midas
                    if inp[go.MODEL_TYPE] > 0 or inputdepthmaps[count] is not None:  # TODO: Weird
                        depthi = depthi_max - depthi + depthi_min
                        depth_max = depthi.max()
                        depth_min = depthi.min()
                    # make positive
                    if depthi_min < 0:
                        depthi = depthi - depthi_min
                        depth_max = depthi.max()
                        depth_min = depthi.min()
                    # scale down
                    if depthi.max() > 10.0:
                        depthi = 4.0 * (depthi - depthi_min) / (depthi_max - depthi_min)
                    # offset
                    depthi = depthi + 1.0

                mesh = create_mesh(inputimages[count], depthi, keep_edges=not inp[go.SIMPLE_MESH_OCCLUDE],
                                   spherical=(inp[go.SIMPLE_MESH_SPHERICAL]))
                mesh.export(meshsimple_fi)
                yield count, 'simple_mesh', meshsimple_fi
        print("Computing output(s) done.")
    except Exception as e:
        import traceback
        if 'out of memory' in str(e).lower():
            print(str(e))
            suggestion = "out of GPU memory, could not generate depthmap! " \
                         "Here are some suggestions to work around this issue:\n"
            if inp[go.BOOST]:
                suggestion += " * Disable BOOST (generation will be faster, but the depthmap will be less detailed)\n"
            if backbone.USED_BACKBONE != backbone.BackboneType.STANDALONE:
                suggestion += " * Run DepthMap in the standalone mode - without launching the SD WebUI\n"
            if device != torch.device("cpu"):
                suggestion += " * Select CPU as the processing device (this will be slower)\n"
            if inp[go.MODEL_TYPE] != 6:
                suggestion += \
                    " * Use a different model (generally, more memory-consuming models produce better depthmaps)\n"
            if not inp[go.BOOST]:
                suggestion += " * Reduce net size (this could reduce quality)\n"
            print('Fail.\n')
            raise Exception(suggestion)
        else:
            print('Fail.\n')
            raise e
    finally:
        # Always release / offload the depth models, even on failure.
        if backbone.get_opt('depthmap_script_keepmodels', True):
            model_holder.offload()  # Swap to CPU memory
        else:
            model_holder.unload_models()
        gc.collect()
        backbone.torch_gc()

    # TODO: This should not be here
    # Second pass: 3d-photo-inpainting over all collected image/depth pairs.
    if inp[go.GEN_INPAINTED_MESH]:
        try:
            mesh_fi = run_3dphoto(device, inpaint_imgs, inpaint_depths, inputnames, outpath,
                                  inp[go.GEN_INPAINTED_MESH_DEMOS],
                                  1, "mp4")
            yield 0, 'inpainted_mesh', mesh_fi
        except Exception as e:
            print(f'{str(e)}, some issue with generating inpainted mesh')

    backbone.reload_sd_model()
    print("All done.\n")
def get_uniquefn(outpath, basename, ext, suffix=''):
    """Return a not-yet-existing path of the form
    ``{outpath}/{basename}-NNNN[-suffix].{ext}``.

    The starting sequence number comes from the backbone; up to 500
    consecutive candidates are probed for a free name.
    """
    basecount = backbone.get_next_sequence_number(outpath, basename)
    if basecount > 0:
        basecount -= 1
    if suffix != '':
        suffix = f'-{suffix}'  # Dash is important for selecting unique filenames (see get_next_sequence_number)
    for i in range(500):
        fullfn = os.path.join(outpath, f"{basename}-{basecount + i:04}{suffix}.{ext}")
        if not os.path.exists(fullfn):
            return fullfn
    # Fallback, should never be executed.
    # Fix: previously this returned a bare filename without joining outpath,
    # unlike every other return path, so the file would land in the CWD.
    return os.path.join(outpath, f"{basename}-99999{suffix}.{ext}")
def run_3dphoto(device, img_rgb, img_depth, inputnames, outpath, gen_inpainted_mesh_demos, vid_ssaa, vid_format):
    """Run 3d-photo-inpainting over image/depth pairs and write .obj meshes.

    Downloads the three inpainting checkpoints on first use, loads the edge,
    depth and color networks onto `device`, writes one inpainted mesh per
    input (optionally rendering demo videos), and returns the path of the
    last mesh written ('' when there were no inputs).
    """
    mesh_fi = ''
    try:
        print("Running 3D Photo Inpainting .. ")
        edgemodel_path = './models/3dphoto/edge_model.pth'
        depthmodel_path = './models/3dphoto/depth_model.pth'
        colormodel_path = './models/3dphoto/color_model.pth'
        # create paths to model if not present
        os.makedirs('./models/3dphoto/', exist_ok=True)
        # Each download is verified against a sha256 checksum.
        ensure_file_downloaded(
            edgemodel_path,
            ["https://huggingface.co/spaces/Epoching/3D_Photo_Inpainting/resolve/e389e564fd2a55cf/checkpoints/edge-model.pth",
             "https://filebox.ece.vt.edu/~jbhuang/project/3DPhoto/model/edge-model.pth"],
            "b1d768bd008ad5fe9f540004f870b8c3d355e4939b2009aa4db493fd313217c9")
        ensure_file_downloaded(
            depthmodel_path,
            ["https://huggingface.co/spaces/Epoching/3D_Photo_Inpainting/resolve/e389e564fd2a55cf/checkpoints/depth-model.pth",
             "https://filebox.ece.vt.edu/~jbhuang/project/3DPhoto/model/depth-model.pth"],
            "2d0e63e89a22762ddfa8bc8c9f8c992e5532b140123274ffc6e4171baa1b76f8")
        ensure_file_downloaded(
            colormodel_path,
            ["https://huggingface.co/spaces/Epoching/3D_Photo_Inpainting/resolve/e389e564fd2a55cf/checkpoints/color-model.pth",
             "https://filebox.ece.vt.edu/~jbhuang/project/3DPhoto/model/color-model.pth"],
            "383c9b1db70097907a6f9c8abb0303e7056f50d5456a36f34ab784592b8b2c20"
        )

        print("Loading edge model ..")
        depth_edge_model = Inpaint_Edge_Net(init_weights=True)
        depth_edge_weight = torch.load(edgemodel_path, map_location=torch.device(device))
        depth_edge_model.load_state_dict(depth_edge_weight)
        depth_edge_model = depth_edge_model.to(device)
        depth_edge_model.eval()

        print("Loading depth model ..")
        depth_feat_model = Inpaint_Depth_Net()
        depth_feat_weight = torch.load(depthmodel_path, map_location=torch.device(device))
        depth_feat_model.load_state_dict(depth_feat_weight, strict=True)
        depth_feat_model = depth_feat_model.to(device)
        depth_feat_model.eval()
        depth_feat_model = depth_feat_model.to(device)

        print("Loading rgb model ..")
        rgb_model = Inpaint_Color_Net()
        rgb_feat_weight = torch.load(colormodel_path, map_location=torch.device(device))
        rgb_model.load_state_dict(rgb_feat_weight)
        rgb_model.eval()
        rgb_model = rgb_model.to(device)

        # Configuration for the inpainting pipeline (mostly upstream defaults).
        config = {}
        config["gpu_ids"] = 0
        config['extrapolation_thickness'] = 60
        config['extrapolate_border'] = True
        config['depth_threshold'] = 0.04
        config['redundant_number'] = 12
        config['ext_edge_threshold'] = 0.002
        config['background_thickness'] = 70
        config['context_thickness'] = 140
        config['background_thickness_2'] = 70
        config['context_thickness_2'] = 70
        config['log_depth'] = True
        config['depth_edge_dilate'] = 10
        config['depth_edge_dilate_2'] = 5
        config['largest_size'] = 512
        config['repeat_inpaint_edge'] = True
        config['ply_fmt'] = "bin"
        config['save_ply'] = backbone.get_opt('depthmap_script_save_ply', False)
        config['save_obj'] = True

        if device == torch.device("cpu"):
            config["gpu_ids"] = -1

        for count in trange(0, len(img_rgb)):
            # Name the mesh after the input file when a name is available.
            basename = 'depthmap'
            if inputnames is not None:
                if inputnames[count] is not None:
                    p = Path(inputnames[count])
                    basename = p.stem

            mesh_fi = get_uniquefn(outpath, basename, 'obj')

            print(f"\nGenerating inpainted mesh .. (go make some coffee) ..")

            # from inpaint.utils.get_MiDaS_samples
            W = img_rgb[count].width
            H = img_rgb[count].height
            # Normalized pinhole intrinsics (focal = max(H, W), principal point at center).
            int_mtx = np.array([[max(H, W), 0, W // 2], [0, max(H, W), H // 2], [0, 0, 1]]).astype(np.float32)
            if int_mtx.max() > 1:
                int_mtx[0, :] = int_mtx[0, :] / float(W)
                int_mtx[1, :] = int_mtx[1, :] / float(H)

            # how inpaint.utils.read_MiDaS_depth() imports depthmap
            disp = img_depth[count].astype(np.float32)
            disp = disp - disp.min()
            disp = cv2.blur(disp / disp.max(), ksize=(3, 3)) * disp.max()
            disp = (disp / disp.max()) * 3.0
            # Disparity -> depth, clamped to avoid division blow-up near zero.
            depth = 1. / np.maximum(disp, 0.05)

            # rgb input
            img = np.asarray(img_rgb[count])
            if len(img.shape) > 2 and img.shape[2] == 4:
                # convert the image from RGBA2RGB
                img = cv2.cvtColor(img, cv2.COLOR_BGRA2BGR)

            # run sparse bilateral filter
            config['sparse_iter'] = 5
            config['filter_size'] = [7, 7, 5, 5, 5]
            config['sigma_s'] = 4.0
            config['sigma_r'] = 0.5
            vis_photos, vis_depths = sparse_bilateral_filtering(depth.copy(), img.copy(), config,
                                                               num_iter=config['sparse_iter'], spdb=False)
            depth = vis_depths[-1]

            # bilat_fn = os.path.join(outpath, basename +'_bilatdepth.png')
            # cv2.imwrite(bilat_fn, depth)

            # NOTE(review): depth_edge_model is intentionally passed twice below —
            # this mirrors the upstream 3d-photo-inpainting call signature; verify
            # against inpaint.mesh.write_mesh before changing.
            rt_info = write_mesh(img,
                                 depth,
                                 int_mtx,
                                 mesh_fi,
                                 config,
                                 rgb_model,
                                 depth_edge_model,
                                 depth_edge_model,
                                 depth_feat_model)

            if rt_info is not False and gen_inpainted_mesh_demos:
                run_3dphoto_videos(mesh_fi, basename, outpath, 300, 40,
                                   [0.03, 0.03, 0.05, 0.03],
                                   ['double-straight-line', 'double-straight-line', 'circle', 'circle'],
                                   [0.00, 0.00, -0.015, -0.015],
                                   [0.00, 0.00, -0.015, -0.00],
                                   [-0.05, -0.05, -0.05, -0.05],
                                   ['dolly-zoom-in', 'zoom-in', 'circle', 'swing'], False, vid_format, vid_ssaa)

            backbone.torch_gc()
    finally:
        # Free the three networks regardless of success.
        # NOTE(review): if loading fails before these names are assigned, the
        # `del` below raises NameError from the finally block — confirm intended.
        del rgb_model
        rgb_model = None
        del depth_edge_model
        depth_edge_model = None
        del depth_feat_model
        depth_feat_model = None
        backbone.torch_gc()

    return mesh_fi
def run_3dphoto_videos(mesh_fi, basename, outpath, num_frames, fps, crop_border, traj_types, x_shift_range,
                       y_shift_range, z_shift_range, video_postfix, vid_dolly, vid_format, vid_ssaa):
    """Render camera-trajectory videos from an inpainted mesh file.

    Each parallel list (traj_types / x_shift_range / y_shift_range /
    z_shift_range / video_postfix) describes one video. The parsed mesh is
    cached in module globals so repeated calls on the same file skip
    re-reading it. Returns the list of saved video filenames.
    """
    import vispy
    # Pick a vispy rendering backend appropriate for the platform; fall back
    # to trying each known backend in turn if the first choice fails.
    try:
        if platform.system() == 'Windows':
            vispy.use(app='PyQt5')
        elif platform.system() == 'Darwin':
            vispy.use('PyQt6')
        else:
            vispy.use(app='egl')
    except:
        import traceback
        print(traceback.format_exc())
        print('Trying an alternative...')
        for u in ['PyQt5', 'PyQt6', 'egl']:
            try:
                vispy.use(app=u)
                break
            except:
                print(f'On {u}')
                print(traceback.format_exc())
        # Honestly, I don't know if it actually helps at all

    # read ply
    global video_mesh_data, video_mesh_fn
    if video_mesh_fn is None or video_mesh_fn != mesh_fi:
        # Different mesh requested: drop the cached one and load from disk.
        try:
            del video_mesh_data
        except:
            print("del video_mesh_data failed")
        video_mesh_fn = mesh_fi
        video_mesh_data = read_mesh(mesh_fi)

    verts, colors, faces, Height, Width, hFov, vFov, mean_loc_depth = video_mesh_data

    original_w = output_w = W = Width
    original_h = output_h = H = Height
    # Normalized pinhole intrinsics, same construction as in run_3dphoto.
    int_mtx = np.array([[max(H, W), 0, W // 2], [0, max(H, W), H // 2], [0, 0, 1]]).astype(np.float32)
    if int_mtx.max() > 1:
        int_mtx[0, :] = int_mtx[0, :] / float(W)
        int_mtx[1, :] = int_mtx[1, :] / float(H)

    config = {}
    config['video_folder'] = outpath
    config['num_frames'] = num_frames
    config['fps'] = fps
    config['crop_border'] = crop_border
    config['traj_types'] = traj_types
    config['x_shift_range'] = x_shift_range
    config['y_shift_range'] = y_shift_range
    config['z_shift_range'] = z_shift_range
    config['video_postfix'] = video_postfix
    config['ssaa'] = vid_ssaa

    # from inpaint.utils.get_MiDaS_samples
    generic_pose = np.eye(4)
    assert len(config['traj_types']) == len(config['x_shift_range']) == \
           len(config['y_shift_range']) == len(config['z_shift_range']) == len(config['video_postfix']), \
        "The number of elements in 'traj_types', 'x_shift_range', 'y_shift_range', 'z_shift_range' and \
        'video_postfix' should be equal."
    tgt_pose = [[generic_pose * 1]]
    tgts_poses = []
    # One list of per-frame camera poses per requested trajectory.
    for traj_idx in range(len(config['traj_types'])):
        tgt_poses = []
        sx, sy, sz = path_planning(config['num_frames'], config['x_shift_range'][traj_idx],
                                   config['y_shift_range'][traj_idx],
                                   config['z_shift_range'][traj_idx], path_type=config['traj_types'][traj_idx])
        for xx, yy, zz in zip(sx, sy, sz):
            # Identity pose with the planned (x, y, z) translation applied.
            tgt_poses.append(generic_pose * 1.)
            tgt_poses[-1][:3, -1] = np.array([xx, yy, zz])
        tgts_poses += [tgt_poses]
    tgt_pose = generic_pose * 1

    # seems we only need the depthmap to calc mean_loc_depth, which is only used when doing 'dolly'
    # width and height are already in the ply file in the comments ..
    # might try to add the mean_loc_depth to it too
    # did just that
    # mean_loc_depth = img_depth[img_depth.shape[0]//2, img_depth.shape[1]//2]

    print("Generating videos ..")

    normal_canvas, all_canvas = None, None
    videos_poses, video_basename = copy.deepcopy(tgts_poses), basename
    # Crop window derived from the normalized principal point.
    top = (original_h // 2 - int_mtx[1, 2] * output_h)
    left = (original_w // 2 - int_mtx[0, 2] * output_w)
    down, right = top + output_h, left + output_w
    border = [int(xx) for xx in [top, down, left, right]]
    normal_canvas, all_canvas, fn_saved = output_3d_photo(verts.copy(), colors.copy(), faces.copy(),
                                                          copy.deepcopy(Height), copy.deepcopy(Width),
                                                          copy.deepcopy(hFov), copy.deepcopy(vFov),
                                                          copy.deepcopy(tgt_pose), config['video_postfix'],
                                                          copy.deepcopy(generic_pose),
                                                          copy.deepcopy(config['video_folder']),
                                                          None, copy.deepcopy(int_mtx), config, None,
                                                          videos_poses, video_basename, original_h, original_w,
                                                          border=border, depth=None, normal_canvas=normal_canvas,
                                                          all_canvas=all_canvas,
                                                          mean_loc_depth=mean_loc_depth, dolly=vid_dolly,
                                                          fnExt=vid_format)
    return fn_saved
def run_makevideo(fn_mesh, vid_numframes, vid_fps, vid_traj, vid_shift, vid_border, dolly, vid_format, vid_ssaa,
                  outpath=None, basename=None):
    """Validate UI parameters and render a single video from a mesh file.

    Raises on a missing mesh, malformed shift (needs 3 comma-separated
    numbers) or border (needs 4). Returns (video_path, video_path, '') —
    presumably shaped for a UI with two outputs plus a status string.
    """
    if len(fn_mesh) == 0 or not os.path.exists(fn_mesh):
        raise Exception("Could not open mesh.")

    vid_ssaa = int(vid_ssaa)

    # traj type
    if vid_traj == 0:
        vid_traj = ['straight-line']
    elif vid_traj == 1:
        vid_traj = ['double-straight-line']
    elif vid_traj == 2:
        vid_traj = ['circle']

    num_fps = int(vid_fps)
    num_frames = int(vid_numframes)
    shifts = vid_shift.split(',')
    if len(shifts) != 3:
        raise Exception("Translate requires 3 elements.")
    x_shift_range = [float(shifts[0])]
    y_shift_range = [float(shifts[1])]
    z_shift_range = [float(shifts[2])]

    borders = vid_border.split(',')
    if len(borders) != 4:
        raise Exception("Crop Border requires 4 elements.")
    crop_border = [float(borders[0]), float(borders[1]), float(borders[2]), float(borders[3])]

    if not outpath:
        outpath = backbone.get_outpath()
    if not basename:
        # output path and filename mess ..
        basename = Path(fn_mesh).stem
        # unique filename
        basecount = backbone.get_next_sequence_number(outpath, basename)
        if basecount > 0: basecount = basecount - 1
        fullfn = None
        for i in range(500):
            fn = f"{basecount + i:05}" if basename == '' else f"{basename}-{basecount + i:04}"
            fullfn = os.path.join(outpath, f"{fn}_." + vid_format)
            if not os.path.exists(fullfn):
                break
        basename = Path(fullfn).stem
        # The probed name ends with '_'; strip it so run_3dphoto_videos can
        # append its own postfix.
        basename = basename[:-1]

    print("Loading mesh ..")

    fn_saved = run_3dphoto_videos(fn_mesh, basename, outpath, num_frames, num_fps, crop_border, vid_traj, x_shift_range,
                                  y_shift_range, z_shift_range, [''], dolly, vid_format, vid_ssaa)
    return fn_saved[-1], fn_saved[-1], ''
def unload_models():
    """Fully release the depth models held by the global model holder."""
    model_holder.unload_models()
| # TODO: code borrowed from the internet to be marked as such and to reside in separate files | |
def batched_background_removal(inimages, model_name):
    """Run rembg background removal over a list of PIL images.

    Returns a new list of images with the background made transparent, one
    per input, in the same order.
    """
    from rembg import new_session, remove
    print('creating background masks')

    # model path and name
    bg_model_dir = Path.joinpath(Path().resolve(), "models/rem_bg")
    os.makedirs(bg_model_dir, exist_ok=True)
    os.environ["U2NET_HOME"] = str(bg_model_dir)

    # starting a session
    background_removal_session = new_session(model_name)
    outimages = [
        Image.fromarray(np.array(remove(img, session=background_removal_session)))
        for img in inimages
    ]
    # The line below might be redundant
    del background_removal_session
    return outimages
def pano_depth_to_world_points(depth):
    """
    360 depth to world points
    given 2D depth is an equirectangular projection of a spherical image
    Treat depth as radius
    longitude : -pi to pi
    latitude : -pi/2 to pi/2
    """
    # Each depth sample is the radius along its viewing ray.
    radius = depth.flatten()

    # Angular grid of the equirectangular projection: one (lon, lat) per pixel.
    lon_grid, lat_grid = np.meshgrid(
        np.linspace(-np.pi, np.pi, depth.shape[1]),
        np.linspace(-np.pi / 2, np.pi / 2, depth.shape[0]))
    lon = lon_grid.flatten()
    lat = lat_grid.flatten()

    # Spherical -> cartesian coordinates.
    cos_lat = np.cos(lat)
    x = radius * cos_lat * np.cos(lon)
    y = radius * cos_lat * np.sin(lon)
    z = radius * np.sin(lat)

    return np.stack([x, y, z], axis=1)
def depth_edges_mask(depth):
    """Returns a mask of edges in the depth map.
    Args:
    depth: 2D numpy array of shape (H, W) with dtype float32.
    Returns:
    mask: 2D numpy array of shape (H, W) with dtype bool.
    """
    # Per-axis gradients of the depth map.
    grad_a, grad_b = np.gradient(depth)
    # A pixel is an edge when the gradient magnitude exceeds the threshold.
    magnitude = np.sqrt(grad_a ** 2 + grad_b ** 2)
    return magnitude > 0.05
def create_mesh(image, depth, keep_edges=False, spherical=False):
    """Build a vertex-colored trimesh from an image and its depthmap.

    Args:
        image: PIL image. NOTE: mutated in place — it is downscaled via
            thumbnail() to the configured maximum mesh size.
        depth: 2D depth array aligned with the image.
            NOTE(review): depth is not resized alongside the thumbnail here;
            presumably callers pass matching sizes — confirm.
        keep_edges: when False, triangles spanning depth discontinuities
            (per depth_edges_mask) are dropped to occlude stretched faces.
        spherical: treat depth as an equirectangular 360 panorama.

    Returns:
        A trimesh.Trimesh with per-vertex colors taken from the image.
    """
    import trimesh
    from dzoedepth.utils.geometry import depth_to_points, create_triangles
    maxsize = backbone.get_opt('depthmap_script_mesh_maxsize', 2048)

    # limit the size of the input image
    image.thumbnail((maxsize, maxsize))

    if not spherical:
        pts3d = depth_to_points(depth[None])
    else:
        pts3d = pano_depth_to_world_points(depth)
    pts3d = pts3d.reshape(-1, 3)

    verts = pts3d.reshape(-1, 3)
    image = np.array(image)
    if keep_edges:
        triangles = create_triangles(image.shape[0], image.shape[1])
    else:
        # Mask out triangles that cross steep depth changes.
        triangles = create_triangles(image.shape[0], image.shape[1], mask=~depth_edges_mask(depth))
    colors = image.reshape(-1, 3)
    mesh = trimesh.Trimesh(vertices=verts, faces=triangles, vertex_colors=colors)

    # rotate 90deg over X when spherical
    if spherical:
        angle = math.pi / 2
        direction = [1, 0, 0]
        center = [0, 0, 0]
        rot_matrix = trimesh.transformations.rotation_matrix(angle, direction, center)
        mesh.apply_transform(rot_matrix)

    return mesh