# Poisson system utilities: sparse gradient/laplacian construction and a
# differentiable Poisson solve (torch + scipy, with optional cupy/cholespy backends).
| import numpy | |
| import igl | |
| import numpy as np | |
| import torch | |
| import time | |
| from scipy.sparse import diags,coo_matrix | |
| from scipy.sparse import csc_matrix as sp_csc | |
## Backend configuration flags ##
USE_TORCH_SPARSE = True ## This uses torch_sparse's spmm instead of torch.sparse
# These four are mutually exclusive
USE_CUPY = False ## This uses CuPy LU decomposition on GPU
USE_CHOLESPY_GPU = True ## This uses cholesky decomposition on GPU
USE_CHOLESPY_CPU = False ## This uses cholesky decomposition on CPU
USE_SCIPY = False ## This uses scipy LU decomposition on CPU
# If USE_SCIPY = True, whether or not to use the enhanced UMFPACK backend
USE_SCIKITS_UMFPACK = False ## This uses UMFPACK backend for scipy instead of naive scipy.
if USE_CHOLESPY_GPU or USE_CHOLESPY_CPU:
    from cholespy import CholeskySolverD, MatrixType
if USE_CUPY and torch.cuda.is_available():
    from cupyx.scipy.sparse.linalg import spsolve_triangular
    from cupyx.scipy.sparse import csr_matrix
    import cupy
    from torch.utils.dlpack import to_dlpack, from_dlpack
    # NOTE(review): these scipy imports shadow the cupyx spsolve_triangular
    # imported just above -- confirm which implementation MyCuSPLU should use.
    from scipy.sparse.linalg import splu as scipy_splu
    from scipy.sparse.linalg import spsolve_triangular, spsolve
if USE_SCIPY:
    if USE_SCIKITS_UMFPACK:
        # This is a bit slower in practice
        # https://stackoverflow.com/questions/64401503/is-there-a-way-to-further-improve-sparse-solution-times-using-python
        from scikits.umfpack import splu as scipy_splu
    else:
        import scipy.sparse.linalg as lg
        lg.use_solver(useUmfpack=False)
        # Slight performance gain with True:
        #   conda install -c conda-forge scikit-umfpack
        # forward pass goes from 0.038 to 0.036
        # assumeSortedIndices=True does not bring any boost
# Unconditional fallback imports so scipy_splu / spsolve are always bound
# (compute_splu / solve_poisson use them regardless of the flags above).
from scipy.sparse.linalg import splu as scipy_splu
from scipy.sparse.linalg import spsolve_triangular, spsolve
if USE_TORCH_SPARSE:
    try:
        import torch_sparse
    except ImportError:
        print("Warning: torch_sparse not available, falling back to built-in PyTorch sparse operations")
        USE_TORCH_SPARSE = False
USE_UGLY_PATCH_FOR_CUPY_ERROR = False
class SparseMat:
    '''
    Sparse matrix stored in the COO format, with values/indices held as torch tensors.
    Refacto : consider killing this object, byproduct of torch_sparse instead of
    torch.sparse (new feature).
    '''
    @staticmethod
    def from_M(M, ttype):
        """Build from a tuple (inds, vals, n, m), e.g. the output of
        _convert_sparse_igl_grad_to_our_convention."""
        return SparseMat(M[0], M[1], M[2], M[3], ttype)

    @staticmethod
    def from_coo(coo, ttype):
        """Build from a scipy.sparse COO matrix."""
        inds = numpy.vstack((coo.row, coo.col))
        return SparseMat(inds, coo.data, coo.shape[0], coo.shape[1], ttype)

    def __init__(self, inds, vals, n, m, ttype):
        """
        :param inds: (2, nnz) numpy integer array of row/col indices
        :param vals: (nnz,) numpy array of values
        :param n: number of rows
        :param m: number of columns
        :param ttype: torch dtype for the values tensor
        """
        self.n = n
        self.m = m
        self.vals = vals
        self.inds = inds
        assert inds.shape[0] == 2
        assert inds.shape[1] == vals.shape[0]
        assert np.max(inds[0, :]) <= n
        # BUGFIX: parentheses were misplaced (`np.max(inds[1,:] <= m)`), which
        # reduced the check to "at least one column index is within bounds".
        assert np.max(inds[1, :]) <= m
        # NOTE(review): `<= n` / `<= m` look off by one for 0-based indices
        # (a valid index should be < n); kept to match the original row check.
        #TODO figure out how to extract the I,J,V,m,n from this, then load a COO mat directly from npz
        self.vals = torch.from_numpy(self.vals).type(ttype).contiguous()
        self.inds = torch.from_numpy(self.inds).type(torch.int64).contiguous()

    def to_coo(self):
        """Return a scipy.sparse COO copy (expects CPU tensors)."""
        return coo_matrix((self.vals, (self.inds[0, :], self.inds[1, :])), shape=(self.n, self.m))

    def to_csc(self):
        """Return a scipy.sparse CSC copy (expects CPU tensors)."""
        return sp_csc((self.vals, (self.inds[0, :], self.inds[1, :])), shape=(self.n, self.m))

    def to_cholesky(self):
        """Build a cholespy Cholesky solver from this matrix (requires cholespy)."""
        return CholeskySolverD(self.n, self.inds[0, :], self.inds[1, :], self.vals, MatrixType.COO)

    def to(self, device):
        """Move values and indices to `device`; returns self for chaining."""
        self.vals = self.vals.to(device)
        self.inds = self.inds.to(device)
        return self

    def pin_memory(self):
        # intentionally a no-op (pinning was disabled)
        return

    def multiply_with_dense(self, dense):
        """Sparse @ dense product; returns a contiguous tensor typed like self.vals."""
        if USE_TORCH_SPARSE:
            res = torch_sparse.spmm(self.inds, self.vals, self.n, self.m, dense)
            # 1000 for loop on the above line takes 0.13 sec. Fast but annoying to have this dependency
        else:
            if self.vals.device.type == 'cpu':
                tensor_zero_hack = torch.FloatTensor([0]).double()  # This line was somehow responsible for a nasty NAN bug
            else:
                tensor_zero_hack = torch.cuda.FloatTensor([0]).to(dense.get_device()).double()
            # beware with addmm, it is experimental and gave me a NaN bug!
            res = torch.sparse.addmm(tensor_zero_hack, torch.sparse_coo_tensor(self.inds.double(), self.vals.double()), (dense.double())).type_as(self.vals)
            # 1000 for loop on the above line takes 0.77 sec. Slower but no dependency
        return res.contiguous()
class PoissonSystemMatrices:
    '''
    Holds the matrices needed to perform gradient and poisson computations.
    Logic : this class is supposed to hold everything needed to compute the Poisson solver.
    Refacto : merge with PoissonSolver; only accepts the SparseMat representation.
    '''
    def __init__(self, V, F, grad, rhs, w, ttype, is_sparse=True, lap=None, cpuonly=False):
        """
        :param V: (#verts, 3) vertex array
        :param F: (#tris, 3) face array
        :param grad: gradient operator (SparseMat expected downstream)
        :param rhs: right-hand-side operator (SparseMat expected downstream)
        :param w: per-face local basis, used to restrict jacobians
        :param ttype: torch dtype used when converting numpy data
        :param is_sparse: kept for API compatibility; sparse is the supported path
        :param lap: optional precomputed laplacian
        :param cpuonly: if True, stick to scipy (no GPU solvers)
        """
        self.dim = 3
        self.is_sparse = is_sparse
        self.w = w
        self.rhs = rhs
        self.igl_grad = grad
        self.ttype = ttype
        self.__splu_L = None
        self.__splu_U = None
        self.__splu_perm_c = None
        self.__splu_perm_r = None
        self.lap = lap
        self.__V = V
        self.__F = F
        self.cpuonly = cpuonly
        self.cpu_splu = None

    def create_poisson_solver(self):
        """Create a PoissonSolver that will lazily factorize self.lap."""
        return PoissonSolver(self.igl_grad, self.w, self.rhs, None, self.lap)

    def create_poisson_solver_from_splu_old(self, lap_L, lap_U, lap_perm_c, lap_perm_r):
        """Create a PoissonSolver from a precomputed SuperLU factorization
        (L, U and the row/column permutations)."""
        w = torch.from_numpy(self.w).type(self.ttype)
        lap = None
        my_splu = None
        if not self.cpuonly:
            if USE_CUPY:
                my_splu = MyCuSPLU(lap_L, lap_U, lap_perm_c, lap_perm_r)
            else:
                if self.lap is not None:
                    # let PoissonSolver factorize the laplacian lazily
                    lap = self.lap
                else:
                    my_splu = MyCuSPLU_CPU(lap_L, lap_U, lap_perm_c, lap_perm_r)
        else:
            if self.lap is not None:
                my_splu = scipy_splu(self.lap)
            else:
                # BUGFIX: was a bare `0/0` (ZeroDivisionError placeholder)
                raise ValueError("cpuonly PoissonSystemMatrices requires a precomputed laplacian")
        return PoissonSolver(self.igl_grad, w, self.rhs, my_splu, lap)

    def compute_poisson_solver_from_laplacian(self, compute_splu=True):
        """Compute the laplacian (and optionally its LU factors), then build a solver."""
        self.compute_laplacian()
        if compute_splu:
            self.compute_splu()
        # BUGFIX: this previously called the non-existent
        # `create_poisson_solver_from_splu`, raising AttributeError at runtime.
        return self.create_poisson_solver_from_splu_old(self.__splu_L, self.__splu_U, self.__splu_perm_c, self.__splu_perm_r)

    def compute_laplacian(self):
        """Compute (or reuse) the cotangent laplacian with the first vertex pinned."""
        if self.lap is None:
            self.lap = igl.cotmatrix(self.__V, self.__F)
            # drop the first row/column: pinning one vertex removes the
            # laplacian's nullspace
            self.lap = self.lap[1:, 1:]
            self.lap = SparseMat.from_coo(self.lap.tocoo(), torch.float64)
        # NOTE(review): self.lap is a SparseMat (or user-provided matrix) here,
        # so this isinstance check against PoissonSystemMatrices can never be
        # True -- dead guard, kept for fidelity.
        if isinstance(self.lap, PoissonSystemMatrices) and self.lap.vals.shape[0] == self.__V.shape[0]:
            assert (False), "this should not happen, the fix is to remove a column and row of the laplacian"
            self.lap = self.lap[1:, 1:]
        return self.lap

    def compute_splu(self):
        """SuperLU-factorize self.lap; caches the factorization and its
        L/U/permutation pieces. Returns (L, U, perm_c, perm_r)."""
        print("i am computing splu")
        if self.cpu_splu is None:
            s = scipy_splu(self.lap)
            # stored in case we later need to build a GPU PoissonSolver;
            # unused in the pure-CPU path
            self.cpu_splu = s
            self.__splu_L = s.L
            self.__splu_U = s.U
            self.__splu_perm_c = s.perm_c
            self.__splu_perm_r = s.perm_r
        return self.__splu_L, self.__splu_U, self.__splu_perm_c, self.__splu_perm_r

    def get_new_grad(self):
        """Re-pack the gradient operator into the interleaved-xyz convention."""
        grad = self.igl_grad.to_coo()
        self.igl_grad = SparseMat.from_M(_convert_sparse_igl_grad_to_our_convention(grad.tocsc()), torch.float64)
        return self.igl_grad
def _convert_sparse_igl_grad_to_our_convention(input):
    '''
    The grad operator computed from igl.grad() results in a matrix of shape (3*#tri x #verts).
    It is packed such that all the x-coordinates are placed first, followed by y and z. As shown below

        ----------          ----------
        | x1 ...            | x1 ...
        | x2 ...            | y1 ...
        | x3 ...            | z1 ...
        | .                 | .
        | .                 | .
        | y1 ...            | x2 ...
        | y2 ...   ---->    | y2 ...
        | y3 ...            | z2 ...
        | .                 | .
        | .                 | .
        | z1 ...            | x3 ...
        | z2 ...            | y3 ...
        | z3 ...            | z3 ...
        | .                 | .
        | .                 | .
        ----------          ----------

    Note that this functionality cannot be computed trivially because igl.grad() is a sparse
    matrix and slicing is not well defined for sparse matrices. The following code performs
    the above conversion and returns the pieces of a COO representation.

    :param input: scipy csc sparse matrix of shape (3*#tri, #verts)
    :return: tuple (indices (2, nnz), data (nnz,), n_rows, n_cols), consumable
             by SparseMat.from_M
    '''
    assert type(input) == sp_csc, 'Input should be a scipy csc sparse matrix'
    T = input.tocoo()
    # pack (row, col, data) into one (nnz, 3) table so all three arrays can be
    # permuted together
    r_c_data = np.hstack((T.row[..., np.newaxis], T.col[..., np.newaxis],
                          T.data[..., np.newaxis]))  # horizontally stack row, col and data arrays
    r_c_data = r_c_data[r_c_data[:, 0].argsort()]  # sort along the row column
    # Separate out x, y and z blocks.
    # NOTE: assumes the grad operator has exactly 3 non-zero elements per row,
    # so each coordinate block holds exactly L = 3*#tri entries -- TODO confirm
    # for degenerate meshes.
    L = T.shape[0]
    Tx = r_c_data[:L, :]
    Ty = r_c_data[L:2 * L, :]
    Tz = r_c_data[2 * L:3 * L, :]
    # align the y,z rows with x so that they too start from 0
    Ty[:, 0] -= Ty[0, 0]
    Tz[:, 0] -= Tz[0, 0]
    # 'stretch' the x,y,z rows so that they can be interleaved.
    Tx[:, 0] *= 3
    Ty[:, 0] *= 3
    Tz[:, 0] *= 3
    # interleave the y,z into x
    Ty[:, 0] += 1
    Tz[:, 0] += 2
    # re-assemble: rows of Tc alternate x, y, z entries
    Tc = np.zeros((input.shape[0] * 3, 3))
    Tc[::3] = Tx
    Tc[1::3] = Ty
    Tc[2::3] = Tz
    indices = Tc[:, :-1].astype(int)
    data = Tc[:, -1]
    return (indices.T, data, input.shape[0], input.shape[1])
class PoissonSolver:
    '''
    An object to compute per-face gradients/jacobians and solve the Poisson system.
    '''
    def __init__(self, grad, W, rhs, my_splu, lap=None):
        """
        :param grad: gradient operator (SparseMat)
        :param W: per-face local basis (numpy array), used by restrict_jacobians
        :param rhs: right-hand-side operator (SparseMat)
        :param my_splu: prefactorized solver (splu/cholesky-like) or None, in
                        which case solve_poisson() factorizes `lap` lazily
        :param lap: optional laplacian used for the lazy factorization
        """
        self.W = torch.from_numpy(W).double()
        self.grad = grad
        self.rhs = rhs
        self.my_splu = my_splu
        self.lap = lap
        # aliases kept for API compatibility with older call sites
        self.sparse_grad = grad
        self.sparse_rhs = rhs

    def to(self, device):
        # Moves the operators (and, when a GPU backend is active, the
        # laplacian) to `device`; returns self for chaining.
        self.W = self.W.to(device)
        self.sparse_grad = self.sparse_grad.to(device)
        self.sparse_rhs = self.sparse_rhs.to(device)
        if USE_CUPY or USE_CHOLESPY_GPU:
            self.lap = self.lap.to(device)
        return self

    def jacobians_from_vertices(self, V):
        # V: (batch, #verts, 3) -> per-face jacobians (batch, #tris, 3, 3).
        # Assumes sparse_grad uses the interleaved-xyz convention produced by
        # get_new_grad -- TODO confirm against the caller.
        res = _multiply_sparse_2d_by_dense_3d(self.sparse_grad, V).type_as(V)
        res = res.unsqueeze(2)
        return res.view(V.shape[0], -1, 3, 3).transpose(2, 3)

    def restrict_jacobians(self, D):
        # Project 3x3 jacobians onto the per-face 2D tangent basis W,
        # giving ...x3x2 restricted jacobians.
        assert isinstance(D, torch.Tensor) and len(D.shape) in [3, 4]
        assert D.shape[-1] == 3 and D.shape[-2] == 3
        assert isinstance(self.W, torch.Tensor) and len(self.W.shape) == 3
        assert self.W.shape[-1] == 2 and self.W.shape[-2] == 3
        if len(D.shape) == 4:
            DW = torch.einsum("abcd,bde->abce", (D, self.W.type_as(D)))
        else:
            DW = torch.einsum("abcd,bde->abce", (D.unsqueeze(0), self.W)).squeeze(0)
        if len(DW.shape) > 4:
            DW = DW.squeeze(0)
        return DW

    def restricted_jacobians_from_vertices(self, V):
        # Convenience composition of the two operations above.
        return self.restrict_jacobians(self.jacobians_from_vertices(V))

    def solve_poisson(self, jacobians):
        """
        Solve the Poisson system for the given per-face jacobians.
        :param jacobians: (batch, #tris, 3, 3)
        :return: predicted vertex positions, centered (mean subtracted per batch)
        """
        assert (len(jacobians.shape) == 4)
        assert (jacobians.shape[2] == 3 and jacobians.shape[3] == 3)
        if self.my_splu is None:
            # lazily factorize the laplacian with the configured backend
            if isinstance(self.lap, SparseMat):
                if USE_CHOLESPY_CPU or USE_CHOLESPY_GPU:
                    self.my_splu = self.lap.to_cholesky()
                else:
                    self.my_splu = scipy_splu(self.lap.to('cpu').to_coo())
            else:
                self.my_splu = scipy_splu(self.lap)
        sol = _predicted_jacobians_to_vertices_via_poisson_solve(self.my_splu, self.sparse_rhs, jacobians.transpose(2, 3).reshape(jacobians.shape[0], -1, 3, 1).squeeze(3).contiguous())
        c = torch.mean(sol, axis=1).unsqueeze(1)  ## Beware the predicted mesh is centered here.
        return sol - c

    def pin_memory(self):
        # intentionally a no-op (pinning was disabled)
        return
def poisson_system_matrices_from_mesh(V, F, dim=3, ttype=torch.float64, is_sparse=True, cpuonly=False):
    '''
    Compute the Poisson matrices for a given mesh.
    :param V: vertices
    :param F: faces
    :param dim: for now always 3 (forced below; the 2D branch is currently dead)
    :param ttype: the type of tensor (e.g., float, double)
    :param is_sparse: for now always true
    :param cpuonly: if True, the resulting object sticks to scipy solvers
    :return: a PoissonSystemMatrices object holding the computed matrices
    '''
    assert type(dim) == int and dim in [2, 3], f'Only two and three dimensional meshes are supported'
    assert type(is_sparse) == bool
    vertices = V
    faces = F
    # NOTE(review): this unconditionally overrides the `dim` argument, so the
    # `if dim == 2` branch below can never run -- TODO confirm intent.
    dim = 3
    is_sparse = is_sparse
    grad = igl.grad(vertices, faces)
    mass = _get_mass_matrix(vertices, faces, is_sparse)
    ## TODO 2D Case ##
    if dim == 2:
        grad = grad[:-grad.shape[0] // 3, :]
        mass = mass[:-mass.shape[0] // 3, :-mass.shape[0] // 3]
    # cotangent-style laplacian assembled from grad and the mass matrix
    laplace = grad.T @ mass @ grad
    # pin the first vertex: drop the first row/column to remove the nullspace
    laplace = laplace[1:, 1:]
    rhs = grad.T @ mass
    b1, b2, _ = igl.local_basis(V, F)
    w = np.stack((b1, b2), axis=-1)
    rhs = rhs[1:, :]
    if is_sparse:
        laplace = laplace.tocoo()
        rhs = rhs.tocoo()
        grad = grad.tocsc()
    else:
        # NOTE(review): the converter below asserts a csc input, so this dense
        # path would fail at the next line -- confirm is_sparse=False is unused.
        laplace = laplace.toarray()
        rhs = rhs.toarray()
        grad = grad.toarray()
    grad = SparseMat.from_M(_convert_sparse_igl_grad_to_our_convention(grad), torch.float64)
    poissonbuilder = PoissonSystemMatrices(V=V, F=F, grad=grad,
                                           rhs=SparseMat.from_coo(rhs, torch.float64), w=w,
                                           ttype=ttype, is_sparse=is_sparse,
                                           lap=SparseMat.from_coo(laplace, torch.float64),
                                           cpuonly=cpuonly)
    return poissonbuilder
def _get_mass_matrix(vertices, faces, is_sparse):
    '''
    Diagonal (double-)area mass matrix for the stacked x/y/z gradient rows.

    :param vertices: mesh vertices
    :param faces: mesh faces
    :param is_sparse: if True, return a scipy csc matrix; otherwise a diags object
    '''
    double_area = igl.doublearea(vertices, faces)
    # one copy of the per-face area per coordinate block (x, y, z)
    stacked = np.hstack((double_area, double_area, double_area))
    mass = diags(stacked)
    return sp_csc(mass) if is_sparse else mass
class SPLUSolveLayer(torch.autograd.Function):
    '''
    Implements the sparse solve as a differentiable layer, with a forward and
    backward pass. The backward pass relies on the system matrix being
    symmetric (see backward()).
    '''
    @staticmethod
    def forward(ctx, solver, b):
        '''
        Override of the forward function.
        :param ctx: context object (keeps the solver for the backward pass)
        :param solver: splu-like or cholesky solver object
        :param b: right hand side, vector or matrix (batched, last dim 1..3)
        :return: x such that A x = b (same dtype as b)
        '''
        assert isinstance(b, torch.Tensor)
        assert b.shape[-1] >= 1 and b.shape[-1] <= 3, f'got shape {b.shape} expected last dim to be in range 1-3'
        b = b.contiguous()
        ctx.solver = solver
        vertices = SPLUSolveLayer.solve(solver, b).type_as(b)
        assert not torch.isnan(vertices).any(), "Nan in the forward pass of the POISSON SOLVE"
        return vertices

    @staticmethod
    def backward(ctx, grad_output):
        '''
        Override of the backward function.
        :param grad_output: the gradient to be back-propped
        :return: (None for the solver argument, gradient w.r.t. b)
        '''
        assert isinstance(grad_output, torch.Tensor)
        assert grad_output.shape[-1] >= 1 and grad_output.shape[
            -1] <= 3, f'got shape {grad_output.shape} expected last dim to be in range 1-3'
        # when backpropping, if a layer is linear with matrix M, x ---> Mx, then the backprop of gradient g is M^T g.
        # In our case M = A^{-1}, so the backprop solves x = A^{-T} g.
        # Because A is symmetric we simply solve A^{-1} g without transposing; this breaks if A is not symmetric.
        grad_output = grad_output.contiguous()
        grad = SPLUSolveLayer.solve(ctx.solver, grad_output)
        # the backsolve occasionally returned NaNs, hence the explicit check
        assert not torch.isnan(grad).any(), "Nan in the backward pass of the POISSON SOLVE"
        if USE_CUPY:
            # release CuPy memory pools to keep GPU memory bounded
            mempool = cupy.get_default_memory_pool()
            pinned_mempool = cupy.get_default_pinned_memory_pool()
            mempool.free_all_blocks()
            pinned_mempool.free_all_blocks()
            # BUGFIX: was `del ctx.lu`, but forward() stores the solver as
            # ctx.solver -- the old attribute never existed (AttributeError).
            del ctx.solver
        return None, grad

    @staticmethod
    def solve(solver, b):
        '''
        Solve A x = b with the configured backend (CuPy LU, scipy, or cholespy).
        If b is a matrix, the solution is a matrix.
        :param solver: the LU/cholesky decomposition object
        :param b: the right hand side to solve for (possibly several columns)
        :return: solution x which satisfies A x = b
        '''
        if USE_CUPY:
            b_cupy = cupy.fromDlpack(to_dlpack(b))
            with cupy.cuda.Device(solver.device()):
                # this will hold the solution
                sol = cupy.ndarray(b_cupy.shape)
                for i in range(b_cupy.shape[2]):  # b may have multiple columns, solve for each one
                    b2d = b_cupy[..., i]
                    s = solver.solve(b2d.T).T
                    sol[:, :, i] = s
            # convert back to torch
            res = from_dlpack(sol.toDlpack())
            # BUGFIX: was `res.type_as(b.type())` -- type_as expects a tensor,
            # not the dtype string returned by b.type().
            return res.type_as(b)
        elif USE_SCIPY:
            # CPU only
            assert (b.shape[0] == 1), "Need to code parrallel implem on the first dim"
            sol = solver.solve(b[0].double().cpu().numpy())
            res = torch.from_numpy(sol).to(b.device).reshape(b.shape)
            return res.type_as(b).contiguous()
        elif USE_CHOLESPY_GPU:
            # fold the batch dimension into extra columns so cholespy solves
            # all right-hand sides in a single call
            b = b.double().contiguous()
            c = b.permute(1, 2, 0).contiguous()
            c = c.view(c.shape[0], -1)
            x = torch.zeros_like(c)
            solver.solve(c, x)
            x = x.view(b.shape[1], b.shape[2], b.shape[0])
            x = x.permute(2, 0, 1).contiguous()
            return x.contiguous()
        elif USE_CHOLESPY_CPU:
            assert (b.shape[0] == 1), "Need to code parrallel implem on the first dim"
            b = b.squeeze()
            b_cpu = b.cpu()
            x = torch.zeros_like(b_cpu)
            solver.solve(b_cpu, x)
            return x.contiguous().to(b.device).unsqueeze(0)
        # BUGFIX: the old trailing `return res.type_as(b)` referenced an
        # undefined `res`; fail loudly when no backend is enabled instead.
        raise RuntimeError("No solver backend enabled (USE_CUPY / USE_SCIPY / USE_CHOLESPY_*)")
def _predicted_jacobians_to_vertices_via_poisson_solve(Lap, rhs, jacobians):
    '''
    Reorder the predicted jacobian entries into the solver's convention and
    run the differentiable Poisson solve.
    '''
    def _batch_rearrange_input(input):
        # de-interleave: rows packed as x1,y1,z1,x2,y2,z2,... become three
        # contiguous x / y / z blocks (row count must be divisible by 3)
        assert isinstance(input, torch.Tensor) and len(input.shape) in [2, 3]
        if len(input.shape) == 3:
            # batched input
            return torch.cat((input[:, ::3], input[:, 1::3], input[:, 2::3]), dim=1)
        return torch.cat((input[::3], input[1::3], input[2::3]), dim=0)

    def _list_rearrange_input(input):
        assert isinstance(input, list) and all([isinstance(x, torch.Tensor) and len(x.shape) in [2, 3] for x in input])
        return [_batch_rearrange_input(p) for p in input]

    if isinstance(jacobians, list):
        P = _list_rearrange_input(jacobians)
    else:
        P = _batch_rearrange_input(jacobians)
    assert isinstance(P, torch.Tensor) and len(P.shape) in [2, 3]
    assert len(P.shape) == 3
    P = P.double()
    input_to_solve = _multiply_sparse_2d_by_dense_3d(rhs, P)
    out = SPLUSolveLayer.apply(Lap, input_to_solve)
    # re-insert the pinned first vertex as a zero row, then center the result
    pinned_row = torch.zeros(out.shape[0], 1, out.shape[2]).type_as(out)
    out = torch.cat([pinned_row, out], dim=1)  ## Why?? Because!
    out = out - torch.mean(out, axis=1, keepdim=True)
    return out.type_as(jacobians)
| def _multiply_sparse_2d_by_dense_3d(mat, B): | |
| ret = [] | |
| for i in range(B.shape[0]): | |
| C = mat.multiply_with_dense(B[i, ...]) | |
| ret.append(C) | |
| ret = torch.stack(tuple(ret)) | |
| return ret | |
class MyCuSPLU:
    '''
    Solves A x = b on the GPU via CuPy, given a precomputed SuperLU
    factorization (Pr A Pc = L U). Factors stay on the host until to()
    uploads them to a CUDA device.
    '''
    def __init__(self, L, U, perm_c=None, perm_r=None):
        # host-side triangular factors; uploaded lazily in to()
        self.__orgL = L
        self.__orgU = U
        self.L = None
        self.U = None
        # row/column permutations from the SuperLU factorization
        self.perm_c = perm_c
        self.perm_r = perm_r
        # CUDA device index; None until to() is called
        self.__device = None

    def to(self, device):
        # assumes it receives a pytorch device object that has an "index" field
        self.__device = device.index
        with cupy.cuda.Device(self.__device):
            self.L = csr_matrix(self.__orgL)
            self.U = csr_matrix(self.__orgU)
        return self

    def device(self):
        # CUDA device index this solver was moved to (None before to())
        return self.__device

    def solve(self, b):
        """ an attempt to use SuperLU data to efficiently solve
            Ax = Pr.T L U Pc.T x = b
            - note that L from SuperLU is in CSC format; solving for c
              results in an efficiency warning
            Pr . A . Pc = L . U
            Lc = b      - forward solve for c
            c = Ux      - then back solve for x
        """
        assert self.__device is not None, "need to explicitly call to() before solving"
        if USE_UGLY_PATCH_FOR_CUPY_ERROR:
            # touching a slice on device 0 appears to work around a CuPy
            # context error -- TODO confirm this is still needed
            with cupy.cuda.Device(0):
                b[:1, :1].copy()[:, :1]
        with cupy.cuda.Device(self.__device):
            b = cupy.array(b)
            if self.perm_r is not None:
                # apply the row permutation: y[perm_r] = b
                b_old = b.copy()
                b[self.perm_r] = b_old
            assert b.device.id == self.__device, "got device" + str(b.device.id) + "instead of" + str(self.__device)
            try:  # unit_diagonal is a new kw
                c = spsolve_triangular(self.L, b, lower=True, unit_diagonal=True, overwrite_b=True)
            except TypeError:
                c = spsolve_triangular(self.L, b, lower=True, overwrite_b=True)
            px = spsolve_triangular(self.U, c, lower=False, overwrite_b=True)
            if self.perm_c is None:
                return px
            # apply the column permutation to recover x
            px = px[self.perm_c]
            return px
class MyCuSPLU_CPU:
    '''
    CPU counterpart of MyCuSPLU: solves A x = b with scipy from a precomputed
    SuperLU factorization (L, U, perm_c, perm_r), mirroring MyCuSPLU's
    to()/device()/solve() interface.
    (The old docstring wrongly said "on the gpu via CuPy".)
    '''
    def __init__(self, L, U, perm_c=None, perm_r=None):
        """
        :param L: lower-triangular factor (scipy sparse, from scipy splu)
        :param U: upper-triangular factor (scipy sparse, from scipy splu)
        :param perm_c: column permutation from the factorization (or None)
        :param perm_r: row permutation from the factorization (or None)
        """
        self.__orgL = L
        self.__orgU = U
        self.L = L
        self.U = U
        self.perm_c = perm_c
        self.perm_r = perm_r
        self.__device = 'cpu'

    def to(self, device):
        # everything lives on the CPU; kept as a no-op for interface parity
        # with MyCuSPLU
        return self

    def device(self):
        return self.__device

    def solve(self, b):
        """ use the SuperLU data to solve
            Ax = Pr.T L U Pc.T x = b
            Pr . A . Pc = L . U
            Lc = Pr b   - forward solve for c
            Ux = c      - then back solve, un-permute columns for x
        Note: mutates `b` in place when perm_r is set.
        """
        if self.perm_r is not None:
            # apply the row permutation: y[perm_r] = b
            b_old = b.copy()
            b[self.perm_r] = b_old
        # BUGFIX: the previous try/except had byte-identical bodies (both
        # called spsolve the same way), so the except branch was pointless;
        # the per-call timing print was also left enabled by mistake.
        # st = time.time()
        c = spsolve(self.L, b, permc_spec="NATURAL")
        px = spsolve(self.U, c, permc_spec="NATURAL")
        # print(f"time for spsolve_triangular CPU: {time.time() - st}")
        if self.perm_c is None:
            return px
        # apply the column permutation to recover x
        return px[self.perm_c]