Spaces:

Stylique
/

Garment3dKabeer

Paused

Garment3dKabeer / NeuralJacobianFields /PoissonSystem.py

Ali Mohsin

more elegent fixes

be97fdc 5 months ago

29.5 kB

	import numpy
	import igl
	import numpy as np
	import torch
	import time

	from scipy.sparse import diags,coo_matrix
	from scipy.sparse import csc_matrix as sp_csc


	USE_TORCH_SPARSE = True ## This uses the torch_sparse package instead of the built-in torch.sparse

	# These four backends are mutually exclusive (SPLUSolveLayer.solve picks the first one that is set)
	USE_CUPY = False ## This uses CUPY LU decomposition on GPU
	USE_CHOLESPY_GPU = True ## This uses cholesky decomposition on GPU
	USE_CHOLESPY_CPU = False ## This uses cholesky decomposition on CPU
	USE_SCIPY = False ## This uses SCIPY LU decomposition on CPU



	# If USE_SCIPY = True, whether or not to use the enhanced backend
	USE_SCIKITS_UMFPACK = False ## This uses UMFPACK backend for scipy instead of naive scipy.

	if USE_CHOLESPY_GPU or USE_CHOLESPY_CPU:
	from cholespy import CholeskySolverD, MatrixType

	if USE_CUPY and torch.cuda.is_available():
	from cupyx.scipy.sparse.linalg import spsolve_triangular
	from cupyx.scipy.sparse import csr_matrix
	import cupy
	from torch.utils.dlpack import to_dlpack, from_dlpack

	from scipy.sparse.linalg import splu as scipy_splu
	from scipy.sparse.linalg import spsolve_triangular, spsolve
	if USE_SCIPY:
	if USE_SCIKITS_UMFPACK:
	# This is a bit slower in practice
	# https://stackoverflow.com/questions/64401503/is-there-a-way-to-further-improve-sparse-solution-times-using-python
	from scikits.umfpack import splu as scipy_splu
	else:
	import scipy.sparse.linalg as lg
	lg.use_solver(useUmfpack=False)
	# Slight performance gain with True
	# conda install -c conda-forge scikit-umfpack
	# forward pass goes from 0.038 to 0.036
	# assumeSortedIndices=True Does not bring any boost
	from scipy.sparse.linalg import splu as scipy_splu
	from scipy.sparse.linalg import spsolve_triangular, spsolve


	if USE_TORCH_SPARSE:
	try:
	import torch_sparse
	except ImportError:
	print("Warning: torch_sparse not available, falling back to built-in PyTorch sparse operations")
	USE_TORCH_SPARSE = False


	USE_UGLY_PATCH_FOR_CUPY_ERROR = False


	class SparseMat:
	    '''
	    Sparse matrix object represented in the COO format.

	    After construction ``inds`` is a (2, nnz) int64 torch tensor holding the
	    (row, col) index of every stored entry, ``vals`` is a (nnz,) torch tensor
	    of the requested type, and ``n`` / ``m`` are the row / column counts.
	    Refacto : consider killing this object, byproduct of torch_sparse instead of torch.sparse (new feature)
	    '''

	    @staticmethod
	    def from_M(M, ttype):
	        '''Build a SparseMat from an ``(inds, vals, n, m)`` sequence.'''
	        return SparseMat(M[0], M[1], M[2], M[3], ttype)

	    @staticmethod
	    def from_coo(coo, ttype):
	        '''Build a SparseMat from an object exposing ``row``, ``col``, ``data`` and ``shape`` (e.g. a scipy COO matrix).'''
	        inds = numpy.vstack((coo.row, coo.col))
	        return SparseMat(inds, coo.data, coo.shape[0], coo.shape[1], ttype)

	    def __init__(self, inds, vals, n, m, ttype):
	        '''
	        :param inds: numpy array of shape (2, nnz) with row indices in inds[0] and column indices in inds[1]
	        :param vals: numpy array of shape (nnz,) with the stored values
	        :param n: number of rows
	        :param m: number of columns
	        :param ttype: torch tensor type the values are converted to
	        '''
	        self.n = n
	        self.m = m
	        assert(inds.shape[0] == 2)
	        assert(inds.shape[1] == vals.shape[0])
	        # np.max raises on an empty array, so only bound-check when there are entries.
	        if vals.shape[0] > 0:
	            assert(np.max(inds[0, :]) <= n)
	            # The original wrote np.max(inds[1,:] <= m), i.e. the max of a boolean mask,
	            # which passes as soon as a single column index is in range.
	            # NOTE(review): for 0-based indices the strict bound would be `<`; `<=` is
	            # kept here to avoid rejecting inputs that were accepted before.
	            assert(np.max(inds[1, :]) <= m)
	        #TODO figure out how to extract the I,J,V,m,n from this, then load a COO mat directly from npz
	        self.vals = torch.from_numpy(vals).type(ttype).contiguous()
	        self.inds = torch.from_numpy(inds).type(torch.int64).contiguous()

	    def _numpy_triplets(self):
	        '''Return (vals, rows, cols) as host numpy arrays, whatever device the tensors live on.'''
	        vals = self.vals.detach().cpu().numpy()
	        inds = self.inds.detach().cpu().numpy()
	        return vals, inds[0, :], inds[1, :]

	    def to_coo(self):
	        '''Return the matrix as a scipy COO matrix.'''
	        vals, rows, cols = self._numpy_triplets()
	        return coo_matrix((vals, (rows, cols)), shape=(self.n, self.m))

	    def to_csc(self):
	        '''Return the matrix as a scipy CSC matrix.'''
	        vals, rows, cols = self._numpy_triplets()
	        return sp_csc((vals, (rows, cols)), shape=(self.n, self.m))

	    def to_cholesky(self):
	        '''Build a cholespy ``CholeskySolverD`` from the COO triplets (uses ``n`` as the system size).'''
	        return CholeskySolverD(self.n, self.inds[0, :], self.inds[1, :], self.vals, MatrixType.COO)

	    def to(self, device):
	        '''Move the index and value tensors to ``device`` in place and return ``self``.'''
	        self.vals = self.vals.to(device)
	        self.inds = self.inds.to(device)
	        return self

	    def pin_memory(self):
	        '''No-op kept for interface compatibility; always returns None.'''
	        return

	    def multiply_with_dense(self, dense):
	        '''
	        Compute ``self @ dense`` for a dense 2D tensor.

	        :param dense: dense tensor with ``m`` rows
	        :return: contiguous dense tensor with ``n`` rows
	        '''
	        if USE_TORCH_SPARSE:
	            res = torch_sparse.spmm(self.inds, self.vals, self.n, self.m, dense)
	            # 1000 for loop on the above line takes 0.13 sec. Fast but annoying to have this dependency
	        else:
	            # Built-in fallback. The previous version passed float64 *indices* and no
	            # size to sparse_coo_tensor and relied on sparse.addmm with a 1-element
	            # "zero" tensor (reported to produce NaNs); indices stay int64 here, the
	            # shape is explicit, and a plain sparse @ dense product is used.
	            sparse = torch.sparse_coo_tensor(self.inds, self.vals.double(), (self.n, self.m))
	            res = torch.sparse.mm(sparse, dense.double()).type_as(self.vals)
	        return res.contiguous()



	class PoissonSystemMatrices:
	    '''
	    Holds the matrices needed to perform gradient and poisson computations
	    Logic : this class is supposed to hold everything needed to compute Poisson Solver
	    Refacto : merge with Poisson Solver
	    Only accept SparseMat representation
	    '''
	    def __init__(self, V, F, grad, rhs, w, ttype, is_sparse=True, lap=None, cpuonly=False):
	        '''
	        :param V: mesh vertices (passed to igl.cotmatrix when the laplacian has to be computed)
	        :param F: mesh faces
	        :param grad: gradient operator (a SparseMat in the visible callers)
	        :param rhs: right-hand-side operator (a SparseMat in the visible callers)
	        :param w: numpy array forwarded to PoissonSolver as its ``W``
	        :param ttype: torch tensor type, stored on the instance
	        :param is_sparse: stored on the instance
	        :param lap: optional precomputed laplacian
	        :param cpuonly: if True, solvers are built with the CPU scipy LU path
	        '''
	        self.dim = 3
	        self.is_sparse = is_sparse
	        self.w = w
	        self.rhs = rhs
	        self.igl_grad = grad
	        self.ttype = ttype
	        self.__splu_L = None
	        self.__splu_U = None
	        self.__splu_perm_c = None
	        self.__splu_perm_r = None
	        self.lap = lap
	        self.__V = V
	        self.__F = F
	        self.cpuonly = cpuonly
	        self.cpu_splu = None

	    def _lap_as_csc(self):
	        '''Return the laplacian in a form scipy's splu accepts (a SparseMat is converted to CSC).'''
	        lap = self.lap
	        return lap.to_csc() if isinstance(lap, SparseMat) else lap

	    def create_poisson_solver(self):
	        '''Create a PoissonSolver without a precomputed factorization.'''
	        return PoissonSolver(self.igl_grad, self.w, self.rhs, None, self.lap)

	    def create_poisson_solver_from_splu_old(self, lap_L, lap_U, lap_perm_c, lap_perm_r):
	        '''
	        Create a PoissonSolver from the pieces of an LU factorization.

	        :param lap_L: L factor
	        :param lap_U: U factor
	        :param lap_perm_c: column permutation
	        :param lap_perm_r: row permutation
	        :raises ValueError: in cpuonly mode when no laplacian is available
	        '''
	        lap = None
	        my_splu = None
	        if not self.cpuonly:
	            if USE_CUPY:
	                my_splu = MyCuSPLU(lap_L, lap_U, lap_perm_c, lap_perm_r)
	            elif self.lap is not None:
	                # The solver factorizes this laplacian lazily on its first solve.
	                lap = self.lap
	            else:
	                my_splu = MyCuSPLU_CPU(lap_L, lap_U, lap_perm_c, lap_perm_r)
	        elif self.lap is not None:
	            my_splu = scipy_splu(self._lap_as_csc())
	        else:
	            # Previously signalled with a bare `0/0`.
	            raise ValueError("cpuonly mode needs a laplacian, but self.lap is None")

	        # PoissonSolver converts W itself, so the raw array is passed through
	        # (the old code pre-converted it to a tensor first).
	        return PoissonSolver(self.igl_grad, self.w, self.rhs, my_splu, lap)

	    # compute_poisson_solver_from_laplacian calls this name; only the `_old`
	    # spelling was defined, so the call raised AttributeError.
	    create_poisson_solver_from_splu = create_poisson_solver_from_splu_old

	    def compute_poisson_solver_from_laplacian(self, compute_splu=True):
	        '''Make sure the laplacian (and optionally its LU factors) exist, then build a solver from them.'''
	        self.compute_laplacian()
	        if compute_splu:
	            self.compute_splu()
	        return self.create_poisson_solver_from_splu(self.__splu_L, self.__splu_U, self.__splu_perm_c, self.__splu_perm_r)

	    def compute_laplacian(self):
	        '''Return the laplacian, computing it from the mesh with igl.cotmatrix (first row/column dropped) if needed.'''
	        if self.lap is None:
	            lap = igl.cotmatrix(self.__V, self.__F)
	            lap = lap[1:, 1:]
	            self.lap = SparseMat.from_coo(lap.tocoo(), torch.float64)

	        # Sanity check: a laplacian with as many rows as vertices has not had its
	        # first row/column removed. The old test compared against the wrong class
	        # and the number of stored values, so it could never fire.
	        if isinstance(self.lap, SparseMat) and self.lap.n == self.__V.shape[0]:
	            raise AssertionError("this should not happen, the fix is to remove a column and row of the laplacian")

	        return self.lap

	    def compute_splu(self):
	        '''Compute (once) and return the scipy LU pieces ``(L, U, perm_c, perm_r)`` of the laplacian.'''
	        print("i am computing splu")
	        if self.cpu_splu is None:
	            s = scipy_splu(self._lap_as_csc())
	            # We are storing these attributes just in case we need to create a PoissonSolver on the GPU, they are useless for CPU case.
	            self.cpu_splu = s
	            self.__splu_L = s.L
	            self.__splu_U = s.U
	            self.__splu_perm_c = s.perm_c
	            self.__splu_perm_r = s.perm_r
	        return self.__splu_L, self.__splu_U, self.__splu_perm_c, self.__splu_perm_r

	    def get_new_grad(self):
	        '''Re-run the row-layout conversion on the stored gradient operator, store the result and return it.'''
	        grad = self.igl_grad.to_coo()
	        self.igl_grad = SparseMat.from_M(_convert_sparse_igl_grad_to_our_convention(grad.tocsc()), torch.float64)
	        return self.igl_grad

	def _convert_sparse_igl_grad_to_our_convention(input):
	    '''
	    The grad operator computed from igl.grad() results in a matrix of shape (3*#tri x #verts).
	    It is packed such that all the x-coordinates are placed first, followed by y and z.
	    This function re-orders the rows so that the x, y and z rows of each triangle are
	    adjacent, as shown below

	        input (by coordinate)        output (by triangle)
	        ----------                   ----------
	        | x1 ...                     | x1 ...
	        | x2 ...                     | y1 ...
	        | x3 ...                     | z1 ...
	        |  .                         |  .
	        |  .                         |  .
	        | y1 ...                     | x2 ...
	        | y2 ...        ---->        | y2 ...
	        | y3 ...                     | z2 ...
	        |  .                         |  .
	        |  .                         |  .
	        | z1 ...                     | x3 ...
	        | z2 ...                     | y3 ...
	        | z3 ...                     | z3 ...
	        |  .                         |  .
	        |  .                         |  .
	        ----------                   ----------

	    Row slicing is avoided; the conversion works directly on the COO triplets.

	    :param input: scipy CSC matrix whose row count is a multiple of 3 and whose x, y and z
	        row blocks each store exactly ``input.shape[0]`` entries (3 per row for igl.grad)
	    :return: tuple ``(indices, data, n_rows, n_cols)`` where ``indices`` is a (2, nnz) integer
	        array of (row, col) pairs in the new layout and ``data`` the matching values
	    :raises ValueError: if the entry counts do not match the layout described above
	    '''
	    assert isinstance(input, sp_csc), 'Input should be a scipy csc sparse matrix'
	    T = input.tocoo()

	    L = T.shape[0]
	    n_tri = L // 3
	    if L == 0 or L % 3 != 0:
	        raise ValueError(f'expected a non-zero multiple of 3 rows, got {L}')
	    # Each coordinate block must hold exactly L stored entries, otherwise the
	    # slicing below would silently mix or drop entries.
	    per_block = np.bincount(T.row // n_tri, minlength=3)
	    if not np.array_equal(per_block, [L, L, L]):
	        raise ValueError(f'expected {L} stored entries in each of the x/y/z blocks, got {per_block.tolist()}')

	    r_c_data = np.hstack((T.row[..., np.newaxis], T.col[..., np.newaxis],
	                          T.data[..., np.newaxis]))  # horizontally stack row, col and data arrays
	    r_c_data = r_c_data[r_c_data[:, 0].argsort()]  # sort along the row column

	    # Separate out x, y and z blocks (views into r_c_data, L entries each)
	    Tx = r_c_data[:L, :]
	    Ty = r_c_data[L:2 * L, :]
	    Tz = r_c_data[2 * L:3 * L, :]

	    # align the y,z rows with x so that they too start from 0
	    # (the block offset is known, so it is not read back from the data)
	    Ty[:, 0] -= n_tri
	    Tz[:, 0] -= 2 * n_tri

	    # 'stretch' the x,y,z rows so that they can be interleaved.
	    Tx[:, 0] *= 3
	    Ty[:, 0] *= 3
	    Tz[:, 0] *= 3

	    # interleave the y,z into x
	    Ty[:, 0] += 1
	    Tz[:, 0] += 2

	    Tc = np.zeros((input.shape[0] * 3, 3))
	    Tc[::3] = Tx
	    Tc[1::3] = Ty
	    Tc[2::3] = Tz

	    indices = Tc[:, :-1].astype(int)
	    data = Tc[:, -1]

	    return (indices.T, data, input.shape[0], input.shape[1])


	class PoissonSolver:
	    '''
	    an object to compute gradients and solve poisson
	    '''

	    def __init__(self, grad, W, rhs, my_splu, lap=None):
	        '''
	        :param grad: gradient operator (must provide ``to`` and ``multiply_with_dense``)
	        :param W: numpy array or torch tensor; stored as a float64 tensor
	        :param rhs: right-hand-side operator (same interface as ``grad``)
	        :param my_splu: precomputed factorization/solver, or None to build it lazily in solve_poisson
	        :param lap: laplacian used to build the factorization when ``my_splu`` is None
	        '''
	        # as_tensor accepts both ndarrays and tensors; from_numpy rejected the
	        # tensor that PoissonSystemMatrices passes in on one of its paths.
	        self.W = torch.as_tensor(W).double()
	        self.grad = grad
	        self.rhs = rhs
	        self.my_splu = my_splu
	        self.lap = lap
	        self.sparse_grad = grad
	        self.sparse_rhs = rhs

	    def to(self, device):
	        '''Move W and the sparse operators (and the laplacian for the GPU backends) to ``device``; returns self.'''
	        self.W = self.W.to(device)
	        self.sparse_grad = self.sparse_grad.to(device)
	        self.sparse_rhs = self.sparse_rhs.to(device)
	        # lap is optional, so guard against None before moving it.
	        if (USE_CUPY or USE_CHOLESPY_GPU) and self.lap is not None:
	            self.lap = self.lap.to(device)
	        return self

	    def jacobians_from_vertices(self, V):
	        '''
	        Apply the gradient operator to every batch element of ``V`` and reshape to 3x3 blocks.

	        :param V: batched dense tensor, batch on the first axis
	        :return: tensor of shape (V.shape[0], -1, 3, 3) with the last two axes transposed
	        '''
	        res = _multiply_sparse_2d_by_dense_3d(self.sparse_grad, V).type_as(V)
	        return res.view(V.shape[0], -1, 3, 3).transpose(2, 3)

	    def restrict_jacobians(self, D):
	        '''
	        Contract 3x3 jacobians with ``self.W`` (shape (#, 3, 2)) to get (…, 3, 2) restricted jacobians.

	        :param D: tensor of shape (batch, #, 3, 3) or (#, 3, 3)
	        :return: tensor with the same leading axes as ``D`` and trailing shape (3, 2)
	        '''
	        assert isinstance(D, torch.Tensor) and len(D.shape) in [3, 4]
	        assert D.shape[-1] == 3 and D.shape[-2] == 3
	        assert isinstance(self.W, torch.Tensor) and len(self.W.shape) == 3
	        assert self.W.shape[-1] == 2 and self.W.shape[-2] == 3

	        # Cast W to D's type in both branches (the unbatched branch used to skip
	        # this, which fails in einsum when D is not float64).
	        W = self.W.type_as(D)
	        if len(D.shape) == 4:
	            return torch.einsum("abcd,bde->abce", D, W)
	        return torch.einsum("abcd,bde->abce", D.unsqueeze(0), W).squeeze(0)

	    def restricted_jacobians_from_vertices(self, V):
	        '''Shorthand for ``restrict_jacobians(jacobians_from_vertices(V))``.'''
	        return self.restrict_jacobians(self.jacobians_from_vertices(V))

	    def solve_poisson(self, jacobians):
	        '''
	        Solve the poisson system for a batch of jacobians.

	        :param jacobians: tensor of shape (batch, #, 3, 3)
	        :return: the solution with its mean over axis 1 subtracted
	        '''
	        assert(len(jacobians.shape) == 4)
	        assert(jacobians.shape[2] == 3 and jacobians.shape[3] == 3)

	        if self.my_splu is None:
	            # Lazily build the factorization on first use.
	            if isinstance(self.lap, SparseMat):
	                if USE_CHOLESPY_CPU or USE_CHOLESPY_GPU:
	                    self.my_splu = self.lap.to_cholesky()
	                else:
	                    # splu works on CSC; handing it COO only triggers a conversion inside scipy.
	                    self.my_splu = scipy_splu(self.lap.to('cpu').to_csc())
	            else:
	                self.my_splu = scipy_splu(self.lap)

	        flat = jacobians.transpose(2, 3).reshape(jacobians.shape[0], -1, 3, 1).squeeze(3).contiguous()
	        sol = _predicted_jacobians_to_vertices_via_poisson_solve(self.my_splu, self.sparse_rhs, flat)
	        c = torch.mean(sol, axis=1).unsqueeze(1)  ## Beware the predicted mesh is centered here.
	        return sol - c

	    def pin_memory(self):
	        '''No-op kept for interface compatibility; always returns None.'''
	        return


	def poisson_system_matrices_from_mesh(V, F, dim=3, ttype=torch.float64, is_sparse=True, cpuonly=False):
	    '''
	    compute poisson matrices for a given mesh
	    :param V vertices
	    :param F faces
	    :param dim: for now always 3 :) (the value is validated, then overridden with 3)
	    :param ttype the type of tensor (e.g., float,double)
	    :param is_sparse: forwarded to the mass-matrix helper and stored on the result;
	        the operators themselves are always built as SparseMat
	    :param cpuonly: forwarded to PoissonSystemMatrices
	    :return: a PoissonSystemMatrices object holding the computed matrices
	    '''

	    assert isinstance(dim, int) and dim in [2, 3], f'Only two and three dimensional meshes are supported'
	    assert isinstance(is_sparse, bool)
	    dim = 3

	    grad = igl.grad(V, F)

	    mass = _get_mass_matrix(V, F, is_sparse)
	    ## TODO 2D Case ## (unreachable while dim is forced to 3 above)
	    if dim == 2:
	        grad = grad[:-grad.shape[0]//3,:]
	        mass = mass[:-mass.shape[0]//3,:-mass.shape[0]//3]

	    # grad.T @ mass is needed both for the laplacian and as the rhs operator,
	    # so it is computed once.
	    rhs = grad.T @ mass
	    laplace = rhs @ grad

	    # Drop the first row/column of the laplacian and the first row of the rhs;
	    # the solve re-inserts a zero row for that vertex afterwards.
	    laplace = laplace[1:, 1:]
	    rhs = rhs[1:, :]

	    b1, b2, _ = igl.local_basis(V, F)
	    w = np.stack((b1, b2), axis=-1)

	    # Everything downstream (grad conversion, SparseMat.from_coo) needs scipy
	    # sparse matrices. The old `is_sparse=False` branch densified them first and
	    # then crashed in those converters, so the sparse formats are used always.
	    laplace = laplace.tocoo()
	    rhs = rhs.tocoo()
	    grad = grad.tocsc()

	    grad = SparseMat.from_M(_convert_sparse_igl_grad_to_our_convention(grad), torch.float64)
	    poissonbuilder = PoissonSystemMatrices(V=V, F=F, grad=grad,
	                                           rhs=SparseMat.from_coo(rhs, torch.float64), w=w,
	                                           ttype=ttype, is_sparse=is_sparse,
	                                           lap=SparseMat.from_coo(laplace, torch.float64),
	                                           cpuonly=cpuonly)
	    return poissonbuilder

	def _get_mass_matrix(vertices, faces, is_sparse):
	    '''
	    Build the diagonal matrix whose diagonal is igl.doublearea(vertices, faces)
	    repeated three times back to back.

	    :param vertices: mesh vertices, forwarded to igl.doublearea
	    :param faces: mesh faces, forwarded to igl.doublearea
	    :param is_sparse: if truthy the result is converted to CSC, otherwise the
	        matrix produced by scipy's ``diags`` is returned as is
	    '''
	    per_face = igl.doublearea(vertices, faces)
	    diagonal = diags(np.hstack((per_face,) * 3))
	    return sp_csc(diagonal) if is_sparse else diagonal




	class SPLUSolveLayer(torch.autograd.Function):
	    '''
	    Implements the SPLU solve as a differentiable layer, with a forward and backward function
	    '''

	    @staticmethod
	    def forward(ctx, solver, b):
	        '''
	        override forward function
	        :param ctx: context object (keeps the solver for the backward pass)
	        :param solver: factorization object (splu / cholesky) used by ``solve``
	        :param b: right hand side tensor, last dim between 1 and 3
	        :return: the tensor x returned by ``solve(solver, b)``, cast to the type of b
	        '''
	        assert isinstance(b, torch.Tensor)
	        assert b.shape[-1] >= 1 and b.shape[-1] <= 3, f'got shape {b.shape} expected last dim to be in range 1-3'
	        b = b.contiguous()
	        ctx.solver = solver

	        vertices = SPLUSolveLayer.solve(solver, b).type_as(b)

	        assert not torch.isnan(vertices).any(), "Nan in the forward pass of the POISSON SOLVE"
	        return vertices

	    @staticmethod
	    def backward(ctx, grad_output):
	        '''
	        overrides backward function
	        :param grad_output: the gradient to be back-propped
	        :return: ``(None, grad)`` - no gradient for the solver, and the outgoing gradient for b
	        '''

	        assert isinstance(grad_output, torch.Tensor)
	        assert grad_output.shape[-1] >= 1 and grad_output.shape[
	            -1] <= 3, f'got shape {grad_output.shape} expected last dim to be in range 1-3'
	        # when backpropping, if a layer is linear with matrix M, x ---> Mx, then the backprop of gradient g is M^Tg
	        # in our case M = A^{-1}, so the backprop is to solve x = A^-T g.
	        # Because A is symmetric we simply solve A^{-1}g without transposing, but this will break if A is not symmetric.
	        grad_output = grad_output.contiguous()
	        # Some backends return float64 regardless of the input type; cast back so
	        # the gradient has the same type as the incoming one.
	        grad = SPLUSolveLayer.solve(ctx.solver, grad_output).type_as(grad_output)
	        # At this point we perform a NAN check because the backsolve sometimes returns NaNs.
	        assert not torch.isnan(grad).any(), "Nan in the backward pass of the POISSON SOLVE"

	        if USE_CUPY:
	            mempool = cupy.get_default_memory_pool()
	            pinned_mempool = cupy.get_default_pinned_memory_pool()
	            mempool.free_all_blocks()
	            pinned_mempool.free_all_blocks()
	            # The old `del ctx.lu` referred to an attribute that is never set
	            # (forward stores `ctx.solver`) and raised AttributeError; dropped.

	        return None, grad

	    @staticmethod
	    def solve(solver, b):
	        '''
	        solve the linear system defined by an SPLU object for a given right hand side. if the RHS is a matrix, solution will also be a matrix.
	        :param solver: the splu object (LU decomposition) or cholesky object
	        :param b: the right hand side to solve for
	        :return: solution x which satisfies Ax = b where A is the poisson system the solver describes
	        :raises RuntimeError: if none of the backend flags is enabled
	        '''

	        if USE_CUPY:
	            b_cupy = cupy.fromDlpack(to_dlpack(b))
	            with cupy.cuda.Device(solver.device()):
	                # this will hold the solution
	                sol = cupy.ndarray(b_cupy.shape)
	                for i in range(b_cupy.shape[2]):  # b may have multiple columns, solve for each one
	                    b2d = b_cupy[..., i]
	                    s = solver.solve(b2d.T).T
	                    sol[:, :, i] = s
	                # convert back to torch
	                res = from_dlpack(sol.toDlpack())

	            # was `res.type_as(b.type())`, which passes a string instead of a tensor
	            return res.type_as(b)

	        elif USE_SCIPY:
	            # only CPU
	            assert(b.shape[0]==1), "Need to code parrallel implem on the first dim"
	            sol = solver.solve(b[0].double().cpu().numpy())
	            res = torch.from_numpy(sol).to(b.device).reshape(b.shape)
	            return res.type_as(b).contiguous()

	        elif USE_CHOLESPY_GPU:
	            # Fold the batch into the column dimension so that a single solve
	            # handles every batch element: (batch, n, k) -> (n, k * batch).
	            b = b.double().contiguous()
	            c = b.permute(1, 2, 0).contiguous()
	            c = c.view(c.shape[0], -1)
	            x = torch.zeros_like(c)
	            solver.solve(c, x)
	            x = x.view(b.shape[1], b.shape[2], b.shape[0])
	            x = x.permute(2, 0, 1).contiguous()
	            return x.contiguous()

	        elif USE_CHOLESPY_CPU:
	            assert(b.shape[0]==1), "Need to code parrallel implem on the first dim"
	            # Only drop the batch axis; a bare squeeze() would also drop a
	            # trailing axis of size 1 and change the shape of the result.
	            b_cpu = b.squeeze(0).cpu()
	            x = torch.zeros_like(b_cpu)
	            solver.solve(b_cpu, x)
	            return x.contiguous().to(b.device).unsqueeze(0)

	        # Reaching this point used to fail with a NameError on an undefined `res`.
	        raise RuntimeError("No solver backend enabled: set one of USE_CUPY, USE_SCIPY, USE_CHOLESPY_GPU, USE_CHOLESPY_CPU")

	def _predicted_jacobians_to_vertices_via_poisson_solve(Lap, rhs, jacobians):
	    '''
	    convert the predictions to the correct convention and feed it to the poisson solve

	    :param Lap: solver/factorization object handed to SPLUSolveLayer
	    :param rhs: sparse operator applied to the rearranged jacobians (needs ``multiply_with_dense``)
	    :param jacobians: batched tensor of shape (batch, rows, k) whose rows are interleaved x,y,z.
	        NOTE(review): a list is accepted by the rearranging step but then rejected by the
	        tensor assertion below, so list input is effectively unsupported.
	    :return: the solution with a zero row prepended, centered along axis 1, cast to the type of ``jacobians``
	    '''

	    def _batch_rearrange_input(input):
	        # Undo the per-triangle interleaving: rows 0,3,6,... first, then 1,4,7,..., then 2,5,8,...
	        assert isinstance(input, torch.Tensor) and len(input.shape) in [2, 3]
	        P = torch.zeros_like(input)
	        if len(input.shape) == 3:
	            # Batched input
	            k = input.shape[1] // 3
	            P[:, :k, :] = input[:, ::3]
	            P[:, k:2 * k, :] = input[:, 1::3]
	            P[:, 2 * k:, :] = input[:, 2::3]
	        else:
	            k = input.shape[0] // 3
	            P[:k, :] = input[::3]
	            P[k:2 * k, :] = input[1::3]
	            P[2 * k:, :] = input[2::3]
	        return P

	    def _list_rearrange_input(input):
	        assert isinstance(input, list) and all([isinstance(x, torch.Tensor) and len(x.shape) in [2, 3] for x in input])
	        return [_batch_rearrange_input(p) for p in input]

	    if isinstance(jacobians, list):
	        P = _list_rearrange_input(jacobians)
	    else:
	        P = _batch_rearrange_input(jacobians)

	    assert isinstance(P, torch.Tensor) and len(P.shape) in [2, 3]
	    assert len(P.shape) == 3

	    P = P.double()
	    input_to_solve = _multiply_sparse_2d_by_dense_3d(rhs, P)

	    out = SPLUSolveLayer.apply(Lap, input_to_solve)

	    # The system is built with the first row of the rhs (and first row/column of
	    # the laplacian) removed, so the solution lacks one row; put it back as zeros.
	    # new_zeros allocates directly with out's dtype and device instead of creating
	    # a CPU tensor and converting it.
	    out = torch.cat([out.new_zeros(out.shape[0], 1, out.shape[2]), out], dim=1)
	    out = out - torch.mean(out, axis=1, keepdim=True)

	    return out.type_as(jacobians)



	def _multiply_sparse_2d_by_dense_3d(mat, B):
	    '''
	    Multiply a sparse 2D operator with every slice of a batched dense tensor.

	    :param mat: object providing ``multiply_with_dense(dense_2d)`` (and ``n``, its row
	        count, which is only read when the batch is empty)
	    :param B: dense tensor, batch on the first axis
	    :return: the per-slice products stacked along a new first axis
	    '''
	    if B.shape[0] == 0:
	        # torch.stack rejects an empty sequence; return an empty batch instead.
	        return B.new_zeros((0, mat.n, B.shape[-1]))
	    return torch.stack([mat.multiply_with_dense(B[i, ...]) for i in range(B.shape[0])])







	class MyCuSPLU:
	    '''
	    implementation of SPLU on the gpu via CuPy
	    '''
	    def __init__(self, L, U, perm_c=None, perm_r=None):
	        '''
	        :param L: lower factor (converted to a CuPy CSR matrix in ``to``)
	        :param U: upper factor (converted to a CuPy CSR matrix in ``to``)
	        :param perm_c: optional column permutation applied to the solution
	        :param perm_r: optional row permutation applied to the right hand side
	        '''
	        self.__orgL = L
	        self.__orgU = U
	        # The device copies are created in to(); solve() refuses to run before that.
	        self.L = None
	        self.U = None
	        self.perm_c = perm_c
	        self.perm_r = perm_r
	        self.__device = None

	    def to(self, device):
	        '''
	        Upload the factors to a GPU.

	        :param device: pytorch device object that has an "index" field
	        :return: self
	        '''
	        index = device.index
	        if index is None:
	            # A device without an explicit index used to leave __device at None,
	            # which made solve() fail its "call to() first" assertion; fall back
	            # to the device CuPy currently has selected.
	            index = cupy.cuda.Device().id
	        self.__device = index
	        with cupy.cuda.Device(self.__device):
	            self.L = csr_matrix(self.__orgL)
	            self.U = csr_matrix(self.__orgU)
	        return self

	    def device(self):
	        '''Return the GPU index set by ``to`` (None before ``to`` was called).'''
	        return self.__device

	    def solve(self, b):
	        """ an attempt to use SuperLU data to efficiently solve
	        Ax = Pr.T L U Pc.T x = b
	        - note that L from SuperLU is in CSC format solving for c
	        results in an efficiency warning
	        Pr . A . Pc = L . U
	        Lc = b - forward solve for c
	        c = Ux - then back solve for x
	        """

	        assert self.__device is not None, "need to explicitly call to() before solving"
	        if USE_UGLY_PATCH_FOR_CUPY_ERROR:
	            with cupy.cuda.Device(0):
	                b[:1, :1].copy()[:, :1]

	        with cupy.cuda.Device(self.__device):
	            # cupy.array copies, so the permutation below does not touch the caller's b
	            b = cupy.array(b)
	            if self.perm_r is not None:
	                b_old = b.copy()
	                b[self.perm_r] = b_old

	            assert b.device.id == self.__device, "got device" + str(b.device.id) + "instead of" + str(self.__device)
	            try:  # unit_diagonal is a new kw
	                c = spsolve_triangular(self.L, b, lower=True, unit_diagonal=True, overwrite_b=True)
	            except TypeError:
	                c = spsolve_triangular(self.L, b, lower=True, overwrite_b=True)
	            px = spsolve_triangular(self.U, c, lower=False, overwrite_b=True)

	            if self.perm_c is None:
	                return px
	            px = px[self.perm_c]

	        return px


	class MyCuSPLU_CPU:
	    '''
	    CPU counterpart of MyCuSPLU: solves with precomputed L / U factors using scipy's spsolve
	    '''
	    def __init__(self, L, U, perm_c=None, perm_r=None):
	        '''
	        :param L: lower factor (sparse matrix)
	        :param U: upper factor (sparse matrix)
	        :param perm_c: optional column permutation applied to the solution
	        :param perm_r: optional row permutation applied to the right hand side
	        '''
	        self.__orgL = L
	        self.__orgU = U
	        self.L = L
	        self.U = U
	        self.perm_c = perm_c
	        self.perm_r = perm_r
	        self.__device = 'cpu'

	    def to(self, device):
	        '''No-op (this solver always stays on the CPU); returns self.'''
	        return self

	    def device(self):
	        '''Always returns 'cpu'.'''
	        return self.__device

	    def solve(self, b):
	        """ an attempt to use SuperLU data to efficiently solve
	        Ax = Pr.T L U Pc.T x = b
	        - note that L from SuperLU is in CSC format solving for c
	        results in an efficiency warning
	        Pr . A . Pc = L . U
	        Lc = b - forward solve for c
	        c = Ux - then back solve for x

	        The right hand side ``b`` is left untouched.
	        """

	        # Could be done on GPU
	        if self.perm_r is not None:
	            # Scatter into a fresh array; the previous version wrote the permuted
	            # values back into the caller's array.
	            b_in = np.asarray(b)
	            b = np.empty_like(b_in)
	            b[self.perm_r] = b_in

	        st = time.time()
	        # The two triangular systems are solved with the general spsolve, keeping
	        # the natural column ordering.
	        c = spsolve(self.L, b, permc_spec="NATURAL")
	        px = spsolve(self.U, c, permc_spec="NATURAL")
	        print(f"time for spsolve_triangular CPU: {time.time() - st}" )

	        if self.perm_c is None:
	            return px
	        px = px[self.perm_c]

	        return px

	# return cupy.asnumpy(px)