import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.autograd.profiler as profiler
import numpy as np
from einops import rearrange, repeat, einsum

from .math_utils import get_ray_limits_box, linspace
from ...modules.diffusionmodules.openaimodel import Timestep


class ImageEncoder(nn.Module):
    """Identity placeholder encoder: returns the input image unchanged."""

    def __init__(self, output_dim: int = 64) -> None:
        super().__init__()
        self.output_dim = output_dim

    def forward(self, image):
        return image


class PositionalEncoding(torch.nn.Module):
    """
    Implement NeRF's positional encoding
    """

    def __init__(self, num_freqs=6, d_in=3, freq_factor=np.pi, include_input=True):
        super().__init__()
        self.num_freqs = num_freqs
        self.d_in = d_in
        self.freqs = freq_factor * 2.0 ** torch.arange(0, num_freqs)
        self.d_out = self.num_freqs * 2 * d_in
        self.include_input = include_input
        if include_input:
            self.d_out += d_in
        # f1 f1 f2 f2 ... to multiply x by
        self.register_buffer(
            "_freqs", torch.repeat_interleave(self.freqs, 2).view(1, -1, 1)
        )
        # 0 pi/2 0 pi/2 ... so that
        # (sin(x + _phases[0]), sin(x + _phases[1]) ...) = (sin(x), cos(x) ...)
        _phases = torch.zeros(2 * self.num_freqs)
        _phases[1::2] = np.pi * 0.5
        self.register_buffer("_phases", _phases.view(1, -1, 1))

    def forward(self, x):
        """
        Apply positional encoding (new implementation)
        :param x (batch, self.d_in)
        :return (batch, self.d_out)
        """
        with profiler.record_function("positional_enc"):
            embed = repeat(x, "... C -> ... N C", N=self.num_freqs * 2)
            embed = torch.sin(torch.addcmul(self._phases, embed, self._freqs))
            embed = rearrange(embed, "... N C -> ... (N C)")
            if self.include_input:
                embed = torch.cat((x, embed), dim=-1)
            return embed
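
# Shape sketch (illustrative only, not part of the original module): with the
# defaults (num_freqs=6, d_in=3, include_input=True), d_out = 3 + 6 * 2 * 3 = 39:
#
#   enc = PositionalEncoding()
#   out = enc(torch.rand(8, 3))  # -> (8, 39)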


class RayGenerator(torch.nn.Module):
    """
    Converts camera poses and intrinsics into ray origins and directions.
    """

    def __init__(self):
        super().__init__()
        (
            self.ray_origins_h,
            self.ray_directions,
            self.depths,
            self.image_coords,
            self.rendering_options,
        ) = (None, None, None, None, None)

    def forward(self, cam2world_matrix, intrinsics, render_size):
        """
        Create batches of rays and return origins and directions.
        cam2world_matrix: (N, 4, 4)
        intrinsics: (N, 3, 3), normalized by resolution
        render_size: int
        ray_origins: (N, M, 3)
        ray_dirs: (N, M, 3)
        """
        N, M = cam2world_matrix.shape[0], render_size**2
        cam_locs_world = cam2world_matrix[:, :3, 3]
        fx = intrinsics[:, 0, 0]
        fy = intrinsics[:, 1, 1]
        cx = intrinsics[:, 0, 2]
        cy = intrinsics[:, 1, 2]
        sk = intrinsics[:, 0, 1]

        # Pixel-center coordinates in [0, 1], flattened to (N, M, 2).
        uv = torch.stack(
            torch.meshgrid(
                torch.arange(
                    render_size, dtype=torch.float32, device=cam2world_matrix.device
                ),
                torch.arange(
                    render_size, dtype=torch.float32, device=cam2world_matrix.device
                ),
                indexing="ij",
            )
        )
        uv = uv.flip(0).reshape(2, -1).transpose(1, 0)
        uv = uv.unsqueeze(0).repeat(cam2world_matrix.shape[0], 1, 1)
        x_cam = uv[:, :, 0].view(N, -1) * (1.0 / render_size) + (0.5 / render_size)
        y_cam = uv[:, :, 1].view(N, -1) * (1.0 / render_size) + (0.5 / render_size)
        z_cam = torch.ones((N, M), device=cam2world_matrix.device)

        # Unproject pixel centers to camera space, accounting for skew.
        x_lift = (
            (
                x_cam
                - cx.unsqueeze(-1)
                + cy.unsqueeze(-1) * sk.unsqueeze(-1) / fy.unsqueeze(-1)
                - sk.unsqueeze(-1) * y_cam / fy.unsqueeze(-1)
            )
            / fx.unsqueeze(-1)
            * z_cam
        )
        y_lift = (y_cam - cy.unsqueeze(-1)) / fy.unsqueeze(-1) * z_cam
        cam_rel_points = torch.stack(
            (x_lift, y_lift, z_cam, torch.ones_like(z_cam)), dim=-1
        )

        # NOTE: this should be named _blender2opencv
        _opencv2blender = (
            torch.tensor(
                [
                    [1, 0, 0, 0],
                    [0, -1, 0, 0],
                    [0, 0, -1, 0],
                    [0, 0, 0, 1],
                ],
                dtype=torch.float32,
                device=cam2world_matrix.device,
            )
            .unsqueeze(0)
            .repeat(N, 1, 1)
        )
        cam2world_matrix = torch.bmm(cam2world_matrix, _opencv2blender)

        world_rel_points = torch.bmm(
            cam2world_matrix, cam_rel_points.permute(0, 2, 1)
        ).permute(0, 2, 1)[:, :, :3]
        ray_dirs = world_rel_points - cam_locs_world[:, None, :]
        ray_dirs = torch.nn.functional.normalize(ray_dirs, dim=2)
        ray_origins = cam_locs_world.unsqueeze(1).repeat(1, ray_dirs.shape[1], 1)
        return ray_origins, ray_dirs
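
# Usage sketch (illustrative only; intrinsics are normalized by resolution):
#
#   gen = RayGenerator()
#   c2w = torch.eye(4)[None]                      # (1, 4, 4)
#   K = torch.tensor([[[1.0, 0.0, 0.5],
#                      [0.0, 1.0, 0.5],
#                      [0.0, 0.0, 1.0]]])         # (1, 3, 3)
#   origins, dirs = gen(c2w, K, render_size=16)   # both (1, 256, 3)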


class RaySampler(torch.nn.Module):
    def __init__(
        self,
        num_samples_per_ray,
        bbox_length=1.0,
        near=0.5,
        far=10000.0,
        disparity=False,
    ):
        super().__init__()
        self.num_samples_per_ray = num_samples_per_ray
        self.bbox_length = bbox_length
        self.near = near
        self.far = far
        self.disparity = disparity

    def forward(self, ray_origins, ray_directions):
        if not self.disparity:
            t_start, t_end = get_ray_limits_box(
                ray_origins, ray_directions, 2 * self.bbox_length
            )
        else:
            t_start = torch.full_like(ray_origins, self.near)
            t_end = torch.full_like(ray_origins, self.far)
        is_ray_valid = t_end > t_start
        if torch.any(is_ray_valid).item():
            # Rays that miss the box get a fallback interval from the valid rays.
            t_start[~is_ray_valid] = t_start[is_ray_valid].min()
            t_end[~is_ray_valid] = t_start[is_ray_valid].max()

        if not self.disparity:
            # Stratified sampling: uniform bins between t_start and t_end,
            # each jittered by up to one bin width.
            depths = linspace(t_start, t_end, self.num_samples_per_ray)
            depths += (
                torch.rand_like(depths)
                * (t_end - t_start)
                / (self.num_samples_per_ray - 1)
            )
        else:
            # Sample uniformly in disparity (1/depth) between near and far.
            step = 1.0 / self.num_samples_per_ray
            z_steps = torch.linspace(
                0, 1 - step, self.num_samples_per_ray, device=ray_origins.device
            )
            z_steps += torch.rand_like(z_steps) * step
            depths = 1 / (1 / self.near * (1 - z_steps) + 1 / self.far * z_steps)
            depths = depths[..., None, None, None]
        return ray_origins[None] + ray_directions[None] * depths
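
# Usage sketch (illustrative only): sample 8 depths per ray along rays that
# intersect a box of half-length bbox_length centered at the origin.
#
#   sampler = RaySampler(num_samples_per_ray=8)
#   pts = sampler(origins, dirs)   # (8, N, M, 3) given origins/dirs of (N, M, 3)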


class PixelNeRF(torch.nn.Module):
    def __init__(
        self,
        num_samples_per_ray: int = 128,
        feature_dim: int = 64,
        interp: str = "bilinear",
        padding: str = "border",
        disparity: bool = False,
        near: float = 0.5,
        far: float = 10000.0,
        use_feats_std: bool = False,
        use_pos_emb: bool = False,
    ) -> None:
        super().__init__()
        # self.positional_encoder = Timestep(3)  # TODO
        self.num_samples_per_ray = num_samples_per_ray
        self.ray_generator = RayGenerator()
        self.ray_sampler = RaySampler(
            num_samples_per_ray, near=near, far=far, disparity=disparity
        )  # TODO
        self.interp = interp
        self.padding = padding
        self.positional_encoder = PositionalEncoding()
        self.use_feats_std = use_feats_std
        self.use_pos_emb = use_pos_emb

        d_in = feature_dim
        if use_feats_std:
            d_in += feature_dim
        if use_pos_emb:
            d_in += self.positional_encoder.d_out
        # Maps aggregated per-sample features to 128 features + 1 blend weight.
        self.feature_aggregator = nn.Sequential(
            nn.Linear(d_in, 128),
            nn.ReLU(),
            nn.Linear(128, 128),
            nn.ReLU(),
            nn.Linear(128, 129),
        )
        # Decodes pooled ray features into 3 RGB channels + 128 feature channels.
        self.decoder = nn.Sequential(
            nn.Linear(128, 128),
            nn.ReLU(),
            nn.Linear(128, 128),
            nn.ReLU(),
            nn.Linear(128, 131),
        )

    def project(self, ray_samples, source_c2ws, source_intrinsics):
        # S is the number of source cameras.
        # ray_samples: [B, N, H * W, N_sample, 3]
        # source_c2ws: [B, S, 4, 4]
        # source_intrinsics: [B, S, 3, 3]
        # returns: [B, S, N, H * W, N_sample, 2]
        S = source_c2ws.shape[1]
        B = ray_samples.shape[0]
        N = ray_samples.shape[1]
        HW = ray_samples.shape[2]
        ray_samples = repeat(
            ray_samples,
            "B N HW N_sample C -> B S N HW N_sample C",
            S=S,
        )
        # Homogeneous coordinates for the world -> camera transform.
        padding = torch.ones((B, S, N, HW, self.num_samples_per_ray, 1)).to(ray_samples)
        ray_samples_homo = torch.cat([ray_samples, padding], dim=-1)
        source_c2ws = repeat(source_c2ws, "B S C1 C2 -> B S N 1 1 C1 C2", N=N)
        source_intrinsics = repeat(source_intrinsics, "B S C1 C2 -> B S N 1 1 C1 C2", N=N)
        source_w2c = source_c2ws.inverse()
        projected_samples = einsum(
            source_w2c, ray_samples_homo, "... i j, ... j -> ... i"
        )[..., :3]
        # NOTE: assumes OpenGL convention (camera looks down -z)
        projected_samples = -1 * projected_samples[..., :2] / projected_samples[..., 2:]
        # NOTE: intrinsics here are normalized by resolution
        fx = source_intrinsics[..., 0, 0]
        fy = source_intrinsics[..., 1, 1]
        cx = source_intrinsics[..., 0, 2]
        cy = source_intrinsics[..., 1, 2]
        x = projected_samples[..., 0] * fx + cx
        # The negative sign comes from OpenGL; F.grid_sample follows the OpenCV convention.
        y = -projected_samples[..., 1] * fy + cy
        return torch.stack([x, y], dim=-1)
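
    # Projection sketch (illustrative only, with num_samples_per_ray=1): a world
    # point one unit in front of an identity-pose OpenGL camera with normalized
    # intrinsics fx = fy = 1, cx = cy = 0.5 projects to the image center:
    #
    #   pts = torch.tensor([0.0, 0.0, -1.0]).view(1, 1, 1, 1, 3)
    #   eye = torch.eye(4).view(1, 1, 4, 4)
    #   K = torch.tensor([[1.0, 0.0, 0.5],
    #                     [0.0, 1.0, 0.5],
    #                     [0.0, 0.0, 1.0]]).view(1, 1, 3, 3)
    #   model.project(pts, eye, K)  # -> [0.5, 0.5], shape (1, 1, 1, 1, 1, 2)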

    def forward(
        self, image_feats, source_c2ws, source_intrinsics, c2ws, intrinsics, render_size
    ):
        # image_feats: [B, S, C, H, W]
        B = c2ws.shape[0]
        T = c2ws.shape[1]  # number of target views
        ray_origins, ray_directions = self.ray_generator(
            c2ws.reshape(-1, 4, 4), intrinsics.reshape(-1, 3, 3), render_size
        )  # [B * N, H * W, 3]
        ray_samples = self.ray_sampler(
            ray_origins, ray_directions
        )  # [N_sample, B * N, H * W, 3]
        ray_samples = rearrange(ray_samples, "Ns (B N) HW C -> B N HW Ns C", B=B)
        projected_samples = self.project(ray_samples, source_c2ws, source_intrinsics)
        image_feats = rearrange(image_feats, "B S C H W -> (B S) C H W")
        projected_samples = rearrange(
            projected_samples, "B S N HW Ns xy -> (B S) (N Ns) HW xy"
        )
        # Map projected samples from [0, 1] to [-1, 1], as required by F.grid_sample.
        joint = F.grid_sample(
            image_feats,
            projected_samples * 2.0 - 1.0,
            padding_mode=self.padding,
            mode=self.interp,
            align_corners=True,
        )
        joint = rearrange(
            joint,
            "(B S) C (N Ns) HW -> B S N HW Ns C",
            B=B,
            Ns=self.num_samples_per_ray,
        )
        reduced = torch.mean(joint, dim=1)  # reduce over the source dimension
        if self.use_feats_std:
            if not joint.shape[1] == 1:
                reduced = torch.cat((reduced, joint.std(dim=1)), dim=-1)
            else:
                reduced = torch.cat((reduced, torch.zeros_like(reduced)), dim=-1)
        if self.use_pos_emb:
            reduced = torch.cat((reduced, self.positional_encoder(ray_samples)), dim=-1)
        reduced = self.feature_aggregator(reduced)
        feats, weights = reduced.split([reduced.shape[-1] - 1, 1], dim=-1)
        # feats: [B, N, H * W, N_samples, 128]
        # weights: [B, N, H * W, N_samples, 1]
        weights = F.softmax(weights, dim=-2)
        feats = torch.sum(feats * weights, dim=-2)  # weighted pooling along each ray
        rgb, feats = self.decoder(feats).split([3, 128], dim=-1)
        rgb = torch.sigmoid(rgb)
        rgb = rearrange(rgb, "B N (H W) C -> B N C H W", H=render_size)
        feats = rearrange(feats, "B N (H W) C -> B N C H W", H=render_size)
        return rgb, feats
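

if __name__ == "__main__":
    # Smoke test with made-up sizes (illustrative only; every shape and value
    # below is an assumption, not something this module prescribes). Because of
    # the relative imports, run it as a module from within its package.
    B, S, T, C = 1, 2, 1, 64
    render_size = 16
    model = PixelNeRF(num_samples_per_ray=8, feature_dim=C)
    pose = torch.eye(4)
    pose[2, 3] = 3.0  # camera 3 units along +z, looking toward the origin
    K = torch.tensor([[1.0, 0.0, 0.5], [0.0, 1.0, 0.5], [0.0, 0.0, 1.0]])
    c2ws = pose.expand(B, T, 4, 4).contiguous()
    source_c2ws = pose.expand(B, S, 4, 4).contiguous()
    intrinsics = K.expand(B, T, 3, 3).contiguous()
    source_intrinsics = K.expand(B, S, 3, 3).contiguous()
    image_feats = torch.rand(B, S, C, 32, 32)
    rgb, feats = model(
        image_feats, source_c2ws, source_intrinsics, c2ws, intrinsics, render_size
    )
    print(rgb.shape, feats.shape)  # (B, T, 3, 16, 16) and (B, T, 128, 16, 16)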