Ali Mohsin
more elegant fixes
be97fdc
import numpy
import igl
import numpy as np
import torch
import time
from scipy.sparse import diags,coo_matrix
from scipy.sparse import csc_matrix as sp_csc
USE_TORCH_SPARSE = True ## This uses TORCH_SPARSE instead of TORCH.SPARSE
# These four are mutually exclusive
USE_CUPY = False ## This uses CUPY LU decomposition on GPU
USE_CHOLESPY_GPU = True ## This uses cholesky decomposition on GPU
USE_CHOLESPY_CPU = False ## This uses cholesky decomposition on CPU
USE_SCIPY = False ## This uses SCIPY LU decomposition on CPU
# If USE_SCIPY = True, whether or not to use the enhanced backend
USE_SCIKITS_UMFPACK = False ## This uses UMFPACK backend for scipy instead of naive scipy.
if USE_CHOLESPY_GPU or USE_CHOLESPY_CPU:
from cholespy import CholeskySolverD, MatrixType
if USE_CUPY and torch.cuda.is_available():
from cupyx.scipy.sparse.linalg import spsolve_triangular
from cupyx.scipy.sparse import csr_matrix
import cupy
from torch.utils.dlpack import to_dlpack, from_dlpack
from scipy.sparse.linalg import splu as scipy_splu
from scipy.sparse.linalg import spsolve_triangular, spsolve
if USE_SCIPY:
if USE_SCIKITS_UMFPACK:
# This is a bit slower in practice
# https://stackoverflow.com/questions/64401503/is-there-a-way-to-further-improve-sparse-solution-times-using-python
from scikits.umfpack import splu as scipy_splu
else:
import scipy.sparse.linalg as lg
lg.use_solver(useUmfpack=False)
# Slight performance gain with True
# conda install -c conda-forge scikit-umfpack
# forward pass goes from 0.038 to 0.036
# assumeSortedIndices=True Does not bring any boost
from scipy.sparse.linalg import splu as scipy_splu
from scipy.sparse.linalg import spsolve_triangular, spsolve
if USE_TORCH_SPARSE:
try:
import torch_sparse
except ImportError:
print("Warning: torch_sparse not available, falling back to built-in PyTorch sparse operations")
USE_TORCH_SPARSE = False
USE_UGLY_PATCH_FOR_CUPY_ERROR = False
class SparseMat:
    '''
    Sparse matrix stored in COO format as a pair of torch tensors.

    Attributes:
        n, m: number of rows / columns.
        inds: int64 tensor of shape (2, nnz); row 0 holds the row indices,
            row 1 the column indices.
        vals: tensor of shape (nnz,) holding the entries, cast to the dtype
            given at construction.

    Refacto : consider killing this object, byproduct of torch_sparse instead of torch.sparse (new feature)
    '''

    @staticmethod
    def from_M(M, ttype):
        '''Build a SparseMat from a tuple ``(inds, vals, n, m)``.'''
        return SparseMat(M[0], M[1], M[2], M[3], ttype)

    @staticmethod
    def from_coo(coo, ttype):
        '''Build a SparseMat from a scipy COO matrix.'''
        inds = np.vstack((coo.row, coo.col))
        return SparseMat(inds, coo.data, coo.shape[0], coo.shape[1], ttype)

    def __init__(self, inds, vals, n, m, ttype):
        '''
        :param inds: numpy array of shape (2, nnz) with row and column indices
        :param vals: numpy array of shape (nnz,) with the matrix entries
        :param n: number of rows
        :param m: number of columns
        :param ttype: torch dtype used to store ``vals``
        '''
        self.n = n
        self.m = m
        assert inds.shape[0] == 2, "inds must have shape (2, nnz)"
        assert inds.shape[1] == vals.shape[0], "inds and vals disagree on the number of entries"
        if inds.shape[1] > 0:  # np.max/np.min raise on empty arrays
            # Indices are 0-based, so they must be strictly smaller than the dimension.
            assert np.min(inds) >= 0, "negative index in sparse matrix"
            assert np.max(inds[0, :]) < n, "row index out of range"
            assert np.max(inds[1, :]) < m, "column index out of range"
        #TODO figure out how to extract the I,J,V,m,n from this, then load a COO mat directly from npz
        self.vals = torch.from_numpy(vals).type(ttype).contiguous()
        self.inds = torch.from_numpy(inds).type(torch.int64).contiguous()

    def _numpy_triplets(self):
        '''Return (vals, rows, cols) as host numpy arrays, whatever device the tensors live on.'''
        vals = self.vals.detach().cpu().numpy()
        inds = self.inds.detach().cpu().numpy()
        return vals, inds[0, :], inds[1, :]

    def to_coo(self):
        '''Return the matrix as a scipy COO matrix.'''
        vals, rows, cols = self._numpy_triplets()
        return coo_matrix((vals, (rows, cols)), shape=(self.n, self.m))

    def to_csc(self):
        '''Return the matrix as a scipy CSC matrix.'''
        vals, rows, cols = self._numpy_triplets()
        return sp_csc((vals, (rows, cols)), shape=(self.n, self.m))

    def to_cholesky(self):
        '''Build a cholespy solver from the stored COO triplets (uses the tensors on their current device).'''
        return CholeskySolverD(self.n, self.inds[0, :], self.inds[1, :], self.vals, MatrixType.COO)

    def to(self, device):
        '''Move the index and value tensors to ``device`` in place and return self.'''
        self.vals = self.vals.to(device)
        self.inds = self.inds.to(device)
        return self

    def pin_memory(self):
        '''No-op placeholder kept for API compatibility.'''
        return
        # self.vals.pin_memory()
        # self.inds.pin_memory()

    def multiply_with_dense(self, dense):
        '''
        Compute ``self @ dense``.

        :param dense: dense tensor with ``self.m`` rows
        :return: contiguous dense tensor with ``self.n`` rows
        '''
        if USE_TORCH_SPARSE:
            res = torch_sparse.spmm(self.inds, self.vals, self.n, self.m, dense)
        else:
            # Dependency-free fallback. COO indices must stay int64 and the size is passed
            # explicitly so trailing empty rows/columns are not dropped.
            sparse = torch.sparse_coo_tensor(self.inds, self.vals.double(), (self.n, self.m))
            res = torch.sparse.mm(sparse, dense.double()).type_as(self.vals)
        return res.contiguous()
class PoissonSystemMatrices:
    '''
    Holds the matrices needed to perform gradient and Poisson computations.

    Logic : this class is supposed to hold everything needed to build a PoissonSolver.
    Refacto : merge with PoissonSolver.
    Only accepts the SparseMat representation.
    '''

    def __init__(self, V, F, grad, rhs, w, ttype, is_sparse=True, lap=None, cpuonly=False):
        '''
        :param V: mesh vertices
        :param F: mesh faces
        :param grad: gradient operator
        :param rhs: right-hand-side operator of the Poisson system
        :param w: per-face local basis as a numpy array (forwarded to PoissonSolver)
        :param ttype: torch dtype requested by the caller
        :param is_sparse: whether the matrices are sparse
        :param lap: optional precomputed Laplacian
        :param cpuonly: if True, solvers are built with the scipy LU factorisation
        '''
        self.dim = 3
        self.is_sparse = is_sparse
        self.w = w
        self.rhs = rhs
        self.igl_grad = grad
        self.ttype = ttype
        self.__splu_L = None
        self.__splu_U = None
        self.__splu_perm_c = None
        self.__splu_perm_r = None
        self.lap = lap
        self.__V = V
        self.__F = F
        self.cpuonly = cpuonly
        self.cpu_splu = None

    def create_poisson_solver(self):
        '''Create a PoissonSolver that factorises the Laplacian lazily on first solve.'''
        return PoissonSolver(self.igl_grad, self.w, self.rhs, None, self.lap)

    def create_poisson_solver_from_splu(self, lap_L, lap_U, lap_perm_c, lap_perm_r):
        '''
        Create a PoissonSolver from LU factors of the Laplacian.

        :param lap_L: lower factor, :param lap_U: upper factor
        :param lap_perm_c: column permutation, :param lap_perm_r: row permutation
        :raises ValueError: in cpu-only mode when no Laplacian is available
        '''
        lap = None
        my_splu = None
        if self.cpuonly:
            if self.lap is None:
                raise ValueError("cpuonly mode needs a Laplacian; call compute_laplacian() first")
            my_splu = scipy_splu(self._laplacian_for_splu())
        elif USE_CUPY:
            my_splu = MyCuSPLU(lap_L, lap_U, lap_perm_c, lap_perm_r)
        elif self.lap is not None:
            # Let the solver factorise the Laplacian itself on first use.
            lap = self.lap
        else:
            my_splu = MyCuSPLU_CPU(lap_L, lap_U, lap_perm_c, lap_perm_r)
        # PoissonSolver converts W from numpy itself, so the raw array is passed.
        return PoissonSolver(self.igl_grad, self.w, self.rhs, my_splu, lap)

    def create_poisson_solver_from_splu_old(self, lap_L, lap_U, lap_perm_c, lap_perm_r):
        '''Backward-compatible alias of create_poisson_solver_from_splu.'''
        return self.create_poisson_solver_from_splu(lap_L, lap_U, lap_perm_c, lap_perm_r)

    def compute_poisson_solver_from_laplacian(self, compute_splu=True):
        '''Make sure the Laplacian (and optionally its LU factors) exist, then build a solver.'''
        self.compute_laplacian()
        if compute_splu:
            self.compute_splu()
        return self.create_poisson_solver_from_splu(
            self.__splu_L, self.__splu_U, self.__splu_perm_c, self.__splu_perm_r)

    def compute_laplacian(self):
        '''
        Return the Laplacian, computing it with igl.cotmatrix when none was supplied.
        The first row and column are removed from a freshly computed matrix.
        '''
        if self.lap is None:
            lap = igl.cotmatrix(self.__V, self.__F)
            lap = lap[1:, 1:]
            self.lap = SparseMat.from_coo(lap.tocoo(), torch.float64)
        # A Laplacian with as many rows as vertices still contains the row/column
        # that must have been removed.
        if (isinstance(self.lap, SparseMat) and self.__V is not None
                and self.lap.n == self.__V.shape[0]):
            raise AssertionError(
                "this should not happen, the fix is to remove a column and row of the laplacian")
        return self.lap

    def _laplacian_for_splu(self):
        '''Return the Laplacian in a form scipy's splu accepts (CSC for a SparseMat, as-is otherwise).'''
        lap = self.lap
        if isinstance(lap, SparseMat):
            vals = lap.vals.detach().cpu().numpy()
            inds = lap.inds.detach().cpu().numpy()
            return sp_csc((vals, (inds[0, :], inds[1, :])), shape=(lap.n, lap.m))
        return lap

    def compute_splu(self):
        '''Compute (once) and cache the LU factorisation of the Laplacian; return (L, U, perm_c, perm_r).'''
        print("i am computing splu")
        if self.cpu_splu is None:
            s = scipy_splu(self._laplacian_for_splu())
            # We are storing these attributes just in case we need to create a PoissonSolver on the GPU,
            # they are useless for CPU case.
            self.cpu_splu = s
            self.__splu_L = s.L
            self.__splu_U = s.U
            self.__splu_perm_c = s.perm_c
            self.__splu_perm_r = s.perm_r
        return self.__splu_L, self.__splu_U, self.__splu_perm_c, self.__splu_perm_r

    def get_new_grad(self):
        '''Re-run the igl-to-interleaved row conversion on the stored gradient and return it.'''
        grad = self.igl_grad.to_coo()
        self.igl_grad = SparseMat.from_M(_convert_sparse_igl_grad_to_our_convention(grad.tocsc()), torch.float64)
        return self.igl_grad
def _convert_sparse_igl_grad_to_our_convention(input):
'''
The grad operator computed from igl.grad() results in a matrix of shape (3*#tri x #verts).
It is packed such that all the x-coordinates are placed first, followed by y and z. As shown below
---------- ----------
| x1 ... | x1 ...
| x2 ... | y1 ...
| x3 ... | z1 ...
| . | .
| . | .
| y1 ... | x2 ...
| y2 ... ----> | y2 ...
| y3 ... | z2 ...
| . | .
| . | .
| z1 ... | x3 ...
| z2 ... | y3 ...
| z3 ... | z3 ...
| . | .
| . | .
---------- ----------
Note that this functionality cannot be computed trivially if because igl.grad() is a sparse tensor and as such
slicing is not well defined for sparse matrices. the following code performs the above conversion and returns a
torch.sparse tensor.
Set check to True to verify the results by converting the matrices to dense and comparing it.
'''
assert type(input) == sp_csc, 'Input should be a scipy csc sparse matrix'
T = input.tocoo()
r_c_data = np.hstack((T.row[..., np.newaxis], T.col[..., np.newaxis],
T.data[..., np.newaxis])) # horizontally stack row, col and data arrays
r_c_data = r_c_data[r_c_data[:, 0].argsort()] # sort along the row column
# Separate out x, y and z blocks
'''
Note that for the grad operator there are exactly 3 non zero elements in a row
'''
L = T.shape[0]
Tx = r_c_data[:L, :]
Ty = r_c_data[L:2 * L, :]
Tz = r_c_data[2 * L:3 * L, :]
# align the y,z rows with x so that they too start from 0
Ty[:, 0] -= Ty[0, 0]
Tz[:, 0] -= Tz[0, 0]
# 'strech' the x,y,z rows so that they can be interleaved.
Tx[:, 0] *= 3
Ty[:, 0] *= 3
Tz[:, 0] *= 3
# interleave the y,z into x
Ty[:, 0] += 1
Tz[:, 0] += 2
Tc = np.zeros((input.shape[0] * 3, 3))
Tc[::3] = Tx
Tc[1::3] = Ty
Tc[2::3] = Tz
indices = Tc[:, :-1].astype(int)
data = Tc[:, -1]
return (indices.T, data, input.shape[0], input.shape[1])
class PoissonSolver:
    '''
    an object to compute gradients and solve poisson
    '''

    def __init__(self, grad, W, rhs, my_splu, lap=None):
        '''
        :param grad: sparse gradient operator (must offer ``to`` and ``multiply_with_dense``)
        :param W: local basis, numpy array or tensor; restrict_jacobians requires shape (*, 3, 2)
        :param rhs: sparse right-hand-side operator
        :param my_splu: factorisation of the Laplacian, or None to build it lazily from ``lap``
        :param lap: Laplacian used for the lazy factorisation
        '''
        # as_tensor accepts numpy arrays as well as tensors
        self.W = torch.as_tensor(W).double()
        self.grad = grad
        self.rhs = rhs
        self.my_splu = my_splu
        self.lap = lap
        self.sparse_grad = grad
        self.sparse_rhs = rhs

    def to(self, device):
        '''Move the tensors held by the solver to ``device`` and return self.'''
        self.W = self.W.to(device)
        self.sparse_grad = self.sparse_grad.to(device)
        self.sparse_rhs = self.sparse_rhs.to(device)
        # The Laplacian is optional (a ready-made factorisation may have been given instead).
        if (USE_CUPY or USE_CHOLESPY_GPU) and self.lap is not None:
            self.lap = self.lap.to(device)
        return self

    def jacobians_from_vertices(self, V):
        '''
        Apply the gradient operator to batched vertices.

        :param V: tensor whose first dimension is the batch
        :return: tensor of shape (batch, -1, 3, 3)
        '''
        res = _multiply_sparse_2d_by_dense_3d(self.sparse_grad, V).type_as(V)
        return res.view(V.shape[0], -1, 3, 3).transpose(2, 3)

    def restrict_jacobians(self, D):
        '''
        Multiply each 3x3 jacobian by the matching 3x2 basis in ``self.W``.

        :param D: tensor of shape (faces, 3, 3) or (batch, faces, 3, 3)
        :return: tensor of shape (faces, 3, 2) or (batch, faces, 3, 2), in the dtype of D
        '''
        assert isinstance(D, torch.Tensor) and len(D.shape) in [3, 4]
        assert D.shape[-1] == 3 and D.shape[-2] == 3
        assert isinstance(self.W, torch.Tensor) and len(self.W.shape) == 3
        assert self.W.shape[-1] == 2 and self.W.shape[-2] == 3
        # W is stored in double; match D so einsum sees a single dtype on both paths.
        W = self.W.type_as(D)
        if len(D.shape) == 4:
            return torch.einsum("abcd,bde->abce", D, W)
        return torch.einsum("abcd,bde->abce", D.unsqueeze(0), W).squeeze(0)

    def restricted_jacobians_from_vertices(self, V):
        '''Compute jacobians from vertices, then restrict them with ``self.W``.'''
        return self.restrict_jacobians(self.jacobians_from_vertices(V))

    def _factorize_laplacian(self):
        '''Build the linear solver for ``self.lap`` according to the enabled backend.'''
        if self.lap is None:
            raise ValueError("PoissonSolver needs either a factorisation or a laplacian")
        if isinstance(self.lap, SparseMat):
            if USE_CHOLESPY_CPU or USE_CHOLESPY_GPU:
                return self.lap.to_cholesky()
            # splu works on CSC; converting here avoids scipy's efficiency warning
            return scipy_splu(self.lap.to('cpu').to_csc())
        return scipy_splu(self.lap)

    def solve_poisson(self, jacobians):
        '''
        Solve the Poisson system for the given jacobians.

        :param jacobians: tensor of shape (batch, faces, 3, 3)
        :return: solved vertices, centred per batch element
        '''
        assert(len(jacobians.shape) == 4)
        assert(jacobians.shape[2] == 3 and jacobians.shape[3] == 3)
        if self.my_splu is None:
            self.my_splu = self._factorize_laplacian()
        stacked = jacobians.transpose(2, 3).reshape(jacobians.shape[0], -1, 3).contiguous()
        sol = _predicted_jacobians_to_vertices_via_poisson_solve(self.my_splu, self.sparse_rhs, stacked)
        ## Beware the predicted mesh is centered here.
        return sol - torch.mean(sol, dim=1, keepdim=True)

    def pin_memory(self):
        '''No-op placeholder kept for API compatibility.'''
        return
        # self.W.pin_memory()
        # self.sparse_grad.pin_memory()
        # self.sparse_rhs.pin_memory()
def poisson_system_matrices_from_mesh( V,F, dim=3,ttype = torch.float64, is_sparse=True,cpuonly=False):
    '''
    Assemble the Poisson system (gradient, Laplacian, right-hand side, local basis) of a mesh.

    :param V vertices
    :param F faces
    :param dim: validated to be 2 or 3, but the computation is currently always done in 3D
    :param ttype the type of tensor (e.g., float,double)
    :param is_sparse: for now always true
    :param cpuonly: forwarded to PoissonSystemMatrices
    :return: a PoissonSystemMatrices object holding the computed matrices
    '''
    assert type(dim) == int and dim in [2,3], f'Only two and three dimensional meshes are supported'
    assert type(is_sparse) == bool

    # The requested dimension is overridden: only the 3D case is implemented.
    dim = 3

    gradient = igl.grad(V, F)
    mass = _get_mass_matrix(V, F, is_sparse)

    ## TODO 2D Case ##
    if dim == 2:
        gradient = gradient[:-gradient.shape[0]//3,:]
        mass = mass[:-mass.shape[0]//3,:-mass.shape[0]//3]

    # Laplacian and right-hand side; the first row (and column) is dropped from both.
    weighted_divergence = gradient.T@mass
    laplacian = (weighted_divergence@gradient)[1:, 1:]
    basis_1, basis_2, _ = igl.local_basis(V, F)
    local_frames = np.stack((basis_1, basis_2), axis=-1)
    divergence = weighted_divergence[1:, :]

    if is_sparse:
        laplacian, divergence, gradient = laplacian.tocoo(), divergence.tocoo(), gradient.tocsc()
    else:
        laplacian, divergence, gradient = laplacian.toarray(), divergence.toarray(), gradient.toarray()

    interleaved_gradient = SparseMat.from_M(
        _convert_sparse_igl_grad_to_our_convention(gradient), torch.float64)
    return PoissonSystemMatrices(
        V=V,
        F=F,
        grad=interleaved_gradient,
        rhs=SparseMat.from_coo(divergence, torch.float64),
        w=local_frames,
        ttype=ttype,
        is_sparse=is_sparse,
        lap=SparseMat.from_coo(laplacian, torch.float64),
        cpuonly=cpuonly,
    )
def _get_mass_matrix(vertices,faces,is_sparse):
    '''
    Build the diagonal matrix holding igl.doublearea of every face, repeated three times.

    :param vertices: mesh vertices
    :param faces: mesh faces
    :param is_sparse: if True return a scipy CSC matrix, otherwise the scipy ``diags`` result
    '''
    face_double_areas = igl.doublearea(vertices, faces)
    diagonal = diags(np.hstack([face_double_areas] * 3))
    return sp_csc(diagonal) if is_sparse else diagonal
class SPLUSolveLayer(torch.autograd.Function):
    '''
    Implements the linear solve as a differentiable layer, with a forward and backward function.
    The backend (CuPy LU, scipy LU, cholespy on GPU or CPU) is chosen by the module-level USE_* flags.
    '''

    @staticmethod
    def forward(ctx, solver, b):
        '''
        override forward function
        :param ctx: context object (keeps the solver for the backward pass)
        :param solver: factorisation object (LU or Cholesky) of the system matrix
        :param b: right hand side, batched; last dimension between 1 and 3
        :return: x such that A x = b, in the dtype of b
        '''
        assert isinstance(b, torch.Tensor)
        assert b.shape[-1] >= 1 and b.shape[-1] <= 3, f'got shape {b.shape} expected last dim to be in range 1-3'
        b = b.contiguous()
        ctx.solver = solver
        vertices = SPLUSolveLayer.solve(solver, b).type_as(b)
        assert not torch.isnan(vertices).any(), "Nan in the forward pass of the POISSON SOLVE"
        return vertices

    @staticmethod
    def backward(ctx, grad_output):
        '''
        overrides backward function
        :param grad_output: the gradient to be back-propped
        :return: (None, gradient w.r.t. b); the solver itself receives no gradient
        '''
        assert isinstance(grad_output, torch.Tensor)
        assert grad_output.shape[-1] >= 1 and grad_output.shape[
            -1] <= 3, f'got shape {grad_output.shape} expected last dim to be in range 1-3'
        # when backpropping, if a layer is linear with matrix M, x ---> Mx, then the backprop of gradient g is M^Tg
        # in our case M = A^{-1}, so the backprop is to solve x = A^-T g.
        # Because A is symmetric we simply solve A^{-1}g without transposing, but this will break if A is not symmetric.
        grad_output = grad_output.contiguous()
        grad = SPLUSolveLayer.solve(ctx.solver, grad_output).type_as(grad_output)
        # At this point we perform a NAN check because the backsolve sometimes returns NaNs.
        assert not torch.isnan(grad).any(), "Nan in the backward pass of the POISSON SOLVE"
        if USE_CUPY:
            # Hand the CuPy pools back so they do not compete with torch for GPU memory.
            cupy.get_default_memory_pool().free_all_blocks()
            cupy.get_default_pinned_memory_pool().free_all_blocks()
        return None, grad

    @staticmethod
    def _cholesky_solve(solver, b):
        '''
        Solve with a cholespy solver for a whole batch in one call.

        The batch is folded into extra right-hand-side columns, (B, N, C) -> (N, C*B),
        solved, and unfolded again. The result is a double tensor on the device of ``b``.
        '''
        b = b.detach().double().contiguous()
        batch, rows, cols = b.shape
        stacked = b.permute(1, 2, 0).contiguous().view(rows, -1)
        x = torch.zeros_like(stacked)
        solver.solve(stacked, x)
        return x.view(rows, cols, batch).permute(2, 0, 1).contiguous()

    @staticmethod
    def solve(solver, b):
        '''
        solve the linear system defined by a factorisation object for a given right hand side.
        :param solver: the splu object (LU decomposition) or cholesky object
        :param b: the right hand side to solve for, of shape (batch, rows, columns)
        :return: solution x which satisfies Ax = b where A is the poisson system the solver describes
        :raises RuntimeError: if none of the USE_* backend flags is enabled
        '''
        if USE_CUPY:
            b_cupy = cupy.fromDlpack(to_dlpack(b.detach()))
            with cupy.cuda.Device(solver.device()):
                # this will hold the solution
                sol = cupy.ndarray(b_cupy.shape)
                for i in range(b_cupy.shape[2]):  # b may have multiple columns, solve for each one
                    sol[:, :, i] = solver.solve(b_cupy[..., i].T).T
                # convert back to torch
                res = from_dlpack(sol.toDlpack())
            return res.type_as(b)
        if USE_SCIPY:
            # only CPU; detach because numpy() refuses tensors that require grad
            rhs = b.detach().double().cpu().numpy()
            sol = np.empty_like(rhs)
            for i, rhs_i in enumerate(rhs):  # one solve per batch element
                sol[i] = solver.solve(rhs_i)
            return torch.from_numpy(sol).to(b.device).type_as(b).contiguous()
        if USE_CHOLESPY_GPU:
            return SPLUSolveLayer._cholesky_solve(solver, b)
        if USE_CHOLESPY_CPU:
            x = SPLUSolveLayer._cholesky_solve(solver, b.cpu())
            return x.to(b.device).type_as(b).contiguous()
        raise RuntimeError(
            "No linear solver backend enabled: set one of USE_CUPY, USE_SCIPY, "
            "USE_CHOLESPY_GPU or USE_CHOLESPY_CPU")
def _predicted_jacobians_to_vertices_via_poisson_solve(Lap, rhs, jacobians):
    '''
    Bring the predicted jacobians into the row layout expected by ``rhs``, run the
    differentiable solve, prepend a zero row and centre the result.

    :param Lap: solver object handed to SPLUSolveLayer
    :param rhs: sparse operator applied to the rearranged jacobians
    :param jacobians: batched tensor of stacked jacobian rows
    :return: centred solution in the dtype of ``jacobians``
    '''

    def _batch_rearrange_input(input):
        # De-interleave rows: rows 0,3,6,... first, then 1,4,7,..., then 2,5,8,...
        assert isinstance(input, torch.Tensor) and len(input.shape) in [2, 3]
        rearranged = torch.zeros(input.shape).type_as(input)
        row_axis = len(input.shape) - 2  # 1 for batched input, 0 otherwise
        third = input.shape[row_axis] // 3
        destinations = (slice(None, third), slice(third, 2 * third), slice(2 * third, None))
        for offset, destination in enumerate(destinations):
            dst = [slice(None)] * len(input.shape)
            src = [slice(None)] * len(input.shape)
            dst[row_axis] = destination
            src[row_axis] = slice(offset, None, 3)
            rearranged[tuple(dst)] = input[tuple(src)]
        return rearranged

    def _list_rearrange_input(input):
        assert isinstance(input, list) and all([isinstance(x, torch.Tensor) and len(x.shape) in [2, 3] for x in input])
        return [_batch_rearrange_input(item) for item in input]

    rearrange = _list_rearrange_input if isinstance(jacobians, list) else _batch_rearrange_input
    P = rearrange(jacobians)

    # Only a single batched tensor is supported past this point.
    assert isinstance(P, torch.Tensor) and len(P.shape) in [2, 3]
    assert len(P.shape) == 3

    system_rhs = _multiply_sparse_2d_by_dense_3d(rhs, P.double())
    solved = SPLUSolveLayer.apply(Lap, system_rhs)
    # The solve leaves out one row; put it back as zeros at the front.
    zero_row = torch.zeros(solved.shape[0], 1, solved.shape[2]).type_as(solved)
    full = torch.cat([zero_row, solved], dim=1)
    centred = full - torch.mean(full, axis=1, keepdim=True)
    return centred.type_as(jacobians)
def _multiply_sparse_2d_by_dense_3d(mat, B):
ret = []
for i in range(B.shape[0]):
C = mat.multiply_with_dense(B[i, ...])
ret.append(C)
ret = torch.stack(tuple(ret))
return ret
class MyCuSPLU:
    '''
    Triangular-solve wrapper around precomputed LU factors, running on the GPU via CuPy
    '''

    def __init__(self, L, U, perm_c=None, perm_r=None):
        # Host-side factors are kept; the device copies are only created in to().
        self.__orgL = L
        self.__orgU = U
        self.L = None
        self.U = None
        self.perm_c = perm_c
        self.perm_r = perm_r
        self.__device = None

    def to(self, device):
        '''
        Upload the factors to a GPU.
        assumes to receive a pytorch device object that has a "index" field
        '''
        self.__device = device.index
        with cupy.cuda.Device(self.__device):
            self.L = csr_matrix(self.__orgL)
            self.U = csr_matrix(self.__orgU)
        return self

    def device(self):
        '''Return the GPU index given to to(), or None if to() was never called.'''
        return self.__device

    def solve(self, b):
        """ an attempt to use SuperLU data to efficiently solve
        Ax = Pr.T L U Pc.T x = b
        - note that L from SuperLU is in CSC format solving for c
        results in an efficiency warning
        Pr . A . Pc = L . U
        Lc = b - forward solve for c
        c = Ux - then back solve for x
        """
        assert self.__device is not None, "need to explicitly call to() before solving"
        if USE_UGLY_PATCH_FOR_CUPY_ERROR:
            with cupy.cuda.Device(0):
                b[:1, :1].copy()[:, :1]
        with cupy.cuda.Device(self.__device):
            b = cupy.array(b)
            if self.perm_r is not None:
                # apply the row permutation: entry i goes to position perm_r[i]
                unpermuted = b.copy()
                b[self.perm_r] = unpermuted
            assert b.device.id == self.__device, "got device" + str(b.device.id) + "instead of" + str(self.__device)
            try:  # unit_diagonal is a new kw
                forward = spsolve_triangular(self.L, b, lower=True, unit_diagonal=True, overwrite_b=True)
            except TypeError:
                forward = spsolve_triangular(self.L, b, lower=True, overwrite_b=True)
            solution = spsolve_triangular(self.U, forward, lower=False, overwrite_b=True)
            # undo the column permutation, if there is one
            return solution if self.perm_c is None else solution[self.perm_c]
class MyCuSPLU_CPU:
    '''
    CPU counterpart of MyCuSPLU: solves with precomputed LU factors using scipy
    '''

    def __init__(self, L, U, perm_c=None, perm_r=None):
        '''
        :param L: lower-triangular factor (scipy sparse)
        :param U: upper-triangular factor (scipy sparse)
        :param perm_c: optional column permutation (index array)
        :param perm_r: optional row permutation (index array)
        '''
        self.__orgL = L
        self.__orgU = U
        self.L = L
        self.U = U
        self.perm_c = perm_c
        self.perm_r = perm_r
        self.__device = 'cpu'

    def to(self, device):
        '''No-op: the factors always stay on the CPU. Returns self.'''
        return self

    def device(self):
        '''Always 'cpu'.'''
        return self.__device

    def solve(self, b):
        """ an attempt to use SuperLU data to efficiently solve
        Ax = Pr.T L U Pc.T x = b
        Pr . A . Pc = L . U
        Lc = b - forward solve for c
        c = Ux - then back solve for x

        :param b: right hand side (vector or matrix); it is not modified
        :return: the solution x
        """
        rhs = np.asarray(b)
        if self.perm_r is not None:
            # Apply the row permutation into a fresh array so the caller's b stays intact.
            permuted = np.empty_like(rhs)
            permuted[self.perm_r] = rhs
            rhs = permuted
        # NATURAL ordering: the factors are already triangular, no extra column permutation wanted.
        c = spsolve(self.L, rhs, permc_spec="NATURAL")
        px = spsolve(self.U, c, permc_spec="NATURAL")
        if self.perm_c is None:
            return px
        return px[self.perm_c]