# deeprobust/graph/defense/gcn_preprocess.py
# (uploaded via upload-large-folder tool, commit 64f5718)
import torch.nn as nn
import torch.nn.functional as F
import math
import torch
from torch.nn.parameter import Parameter
from torch.nn.modules.module import Module
from deeprobust.graph import utils
from deeprobust.graph.defense import GCN
from sklearn.utils.extmath import randomized_svd
from tqdm import tqdm
import scipy.sparse as sp
import numpy as np
from numba import njit
class GCNSVD(GCN):
    """GCNSVD is a 2 Layer Graph Convolutional Network with Truncated SVD as
    preprocessing. See more details in All You Need Is Low (Rank): Defending
    Against Adversarial Attacks on Graphs,
    https://dl.acm.org/doi/abs/10.1145/3336191.3371789.

    Parameters
    ----------
    nfeat : int
        size of input feature dimension
    nhid : int
        number of hidden units
    nclass : int
        size of output dimension
    dropout : float
        dropout rate for GCN
    lr : float
        learning rate for GCN
    weight_decay : float
        weight decay coefficient (l2 normalization) for GCN. When `with_relu` is True, `weight_decay` will be set to 0.
    with_relu : bool
        whether to use relu activation function. If False, GCN will be linearized.
    with_bias: bool
        whether to include bias term in GCN weights.
    device: str
        'cpu' or 'cuda'.

    Examples
    --------
    We can first load dataset and then train GCNSVD.

    >>> from deeprobust.graph.data import PrePtbDataset, Dataset
    >>> from deeprobust.graph.defense import GCNSVD
    >>> # load clean graph data
    >>> data = Dataset(root='/tmp/', name='cora', seed=15)
    >>> adj, features, labels = data.adj, data.features, data.labels
    >>> idx_train, idx_val, idx_test = data.idx_train, data.idx_val, data.idx_test
    >>> # load perturbed graph data
    >>> perturbed_data = PrePtbDataset(root='/tmp/', name='cora')
    >>> perturbed_adj = perturbed_data.adj
    >>> # train defense model
    >>> model = GCNSVD(nfeat=features.shape[1],
              nhid=16,
              nclass=labels.max().item() + 1,
              dropout=0.5, device='cpu').to('cpu')
    >>> model.fit(features, perturbed_adj, labels, idx_train, idx_val, k=20)
    """

    def __init__(self, nfeat, nhid, nclass, dropout=0.5, lr=0.01, weight_decay=5e-4, with_relu=True, with_bias=True, device='cpu'):
        super(GCNSVD, self).__init__(nfeat, nhid, nclass, dropout, lr, weight_decay, with_relu, with_bias, device=device)
        self.device = device
        # Rank used by the last call to fit(); predict() reuses it so a new
        # adjacency receives the same preprocessing as the training graph.
        self.k = None

    def fit(self, features, adj, labels, idx_train, idx_val=None, k=50, train_iters=200, initialize=True, verbose=True, **kwargs):
        """First perform rank-k approximation of adjacency matrix via
        truncated SVD, and then train the gcn model on the processed graph,
        when idx_val is not None, pick the best model according to
        the validation loss.

        Parameters
        ----------
        features :
            node features
        adj :
            the adjacency matrix. The format could be torch.tensor or scipy matrix
        labels :
            node labels
        idx_train :
            node training indices
        idx_val :
            node validation indices. If not given (None), GCN training process will not adpot early stopping
        k : int
            number of singular values and vectors to compute.
        train_iters : int
            number of training epochs
        initialize : bool
            whether to initialize parameters before training
        verbose : bool
            whether to show verbose logs
        """
        # Bug fix: the adjacency may be a scipy/numpy matrix (as documented
        # above), which has no `.to()` method; only move torch tensors to CPU.
        if isinstance(adj, torch.Tensor):
            adj = adj.to('cpu')
        modified_adj = self.truncatedSVD(adj, k=k)
        # Remember the rank so predict() can preprocess new graphs identically.
        self.k = k
        features, modified_adj, labels = utils.to_tensor(features, modified_adj, labels, device=self.device)
        self.modified_adj = modified_adj
        self.features = features
        self.labels = labels
        super().fit(features, modified_adj, labels, idx_train, idx_val, train_iters=train_iters, initialize=initialize, verbose=verbose)

    def truncatedSVD(self, data, k=50):
        """Return a dense rank-k approximation of ``data`` using sklearn's
        randomized SVD algorithm.

        Parameters
        ----------
        data :
            matrix to decompose; torch.Tensor, numpy array or scipy sparse
            matrix (randomized_svd accepts all of these once tensors are
            converted to numpy).
        k : int
            number of singular values and vectors to compute.

        Returns
        -------
        numpy.ndarray
            reconstructed rank-k matrix ``U @ diag(S) @ Vt``.
        """
        print(f'=== Fast SVD: rank={k} ===')
        # Convert torch Tensor to NumPy
        if isinstance(data, torch.Tensor):
            data = data.detach().cpu().numpy()
        # Bug fix: random_state was None, making the approximation
        # nondeterministic; repeated calls on the same matrix (e.g. the two
        # predict() passes in the script-level sanity check) now agree.
        U, S, Vt = randomized_svd(data, n_components=k, n_iter=10, random_state=0)
        print("rank_after =", np.count_nonzero(S))
        diag_S = np.diag(S)
        return U @ diag_S @ Vt

    def predict(self, features=None, adj=None):
        """By default, the inputs should be unnormalized adjacency

        Parameters
        ----------
        features :
            node features. If `features` and `adj` are not given, this function will use previous stored `features` and `adj` from training to make predictions.
        adj :
            adjcency matrix. If `features` and `adj` are not given, this function will use previous stored `features` and `adj` from training to make predictions.

        Returns
        -------
        torch.FloatTensor
            output (log probabilities) of GCNSVD
        """
        self.eval()
        if features is None and adj is None:
            return self.forward(self.features, self.adj_norm)
        else:
            # Robustness fix: fall back to the fit() default rank when
            # predict() is called before fit() (self.k would be None).
            k = self.k if self.k is not None else 50
            adj = self.truncatedSVD(adj, k=k)
            # truncatedSVD returns a numpy array, so this branch is taken and
            # both inputs are converted to tensors on the model device.
            if type(adj) is not torch.Tensor:
                features, adj = utils.to_tensor(features, adj, device=self.device)
            self.features = features
            if utils.is_sparse_tensor(adj):
                self.adj_norm = utils.normalize_adj_tensor(adj, sparse=True)
            else:
                self.adj_norm = utils.normalize_adj_tensor(adj)
            return self.forward(self.features, self.adj_norm)
class GCNJaccard(GCN):
    """GCNJaccard first preprocesses input graph via droppining dissimilar
    edges and train a GCN based on the processed graph. See more details in
    Adversarial Examples on Graph Data: Deep Insights into Attack and Defense,
    https://arxiv.org/pdf/1903.01610.pdf.

    Parameters
    ----------
    nfeat : int
        size of input feature dimension
    nhid : int
        number of hidden units
    nclass : int
        size of output dimension
    dropout : float
        dropout rate for GCN
    lr : float
        learning rate for GCN
    weight_decay : float
        weight decay coefficient (l2 normalization) for GCN. When `with_relu` is True, `weight_decay` will be set to 0.
    with_relu : bool
        whether to use relu activation function. If False, GCN will be linearized.
    with_bias: bool
        whether to include bias term in GCN weights.
    device: str
        'cpu' or 'cuda'.

    Examples
    --------
    We can first load dataset and then train GCNJaccard.

    >>> from deeprobust.graph.data import PrePtbDataset, Dataset
    >>> from deeprobust.graph.defense import GCNJaccard
    >>> # load clean graph data
    >>> data = Dataset(root='/tmp/', name='cora', seed=15)
    >>> adj, features, labels = data.adj, data.features, data.labels
    >>> idx_train, idx_val, idx_test = data.idx_train, data.idx_val, data.idx_test
    >>> # load perturbed graph data
    >>> perturbed_data = PrePtbDataset(root='/tmp/', name='cora')
    >>> perturbed_adj = perturbed_data.adj
    >>> # train defense model
    >>> model = GCNJaccard(nfeat=features.shape[1],
              nhid=16,
              nclass=labels.max().item() + 1,
              dropout=0.5, device='cpu').to('cpu')
    >>> model.fit(features, perturbed_adj, labels, idx_train, idx_val, threshold=0.03)
    """

    def __init__(self, nfeat, nhid, nclass, binary_feature=True, dropout=0.5, lr=0.01, weight_decay=5e-4, with_relu=True, with_bias=True, device='cpu'):
        super(GCNJaccard, self).__init__(nfeat, nhid, nclass, dropout, lr, weight_decay, with_relu, with_bias, device=device)
        self.device = device
        # When True, edges are filtered by Jaccard similarity of binary
        # features; otherwise cosine similarity is used.
        self.binary_feature = binary_feature

    def fit(self, features, adj, labels, idx_train, idx_val=None, threshold=0.01, train_iters=200, initialize=True, verbose=True, **kwargs):
        """First drop dissimilar edges with similarity smaller than given
        threshold and then train the gcn model on the processed graph.
        When idx_val is not None, pick the best model according to the
        validation loss.

        Parameters
        ----------
        features :
            node features. The format can be numpy.array or scipy matrix
        adj :
            the adjacency matrix.
        labels :
            node labels
        idx_train :
            node training indices
        idx_val :
            node validation indices. If not given (None), GCN training process will not adpot early stopping
        threshold : float
            similarity threshold for dropping edges. If two connected nodes with similarity smaller than threshold, the edge between them will be removed.
        train_iters : int
            number of training epochs
        initialize : bool
            whether to initialize parameters before training
        verbose : bool
            whether to show verbose logs
        """
        self.threshold = threshold
        modified_adj = self.drop_dissimilar_edges(features, adj)
        features, modified_adj, labels = utils.to_tensor(features, modified_adj, labels, device=self.device)
        self.modified_adj = modified_adj
        self.features = features
        self.labels = labels
        super().fit(features, modified_adj, labels, idx_train, idx_val, train_iters=train_iters, initialize=initialize, verbose=verbose)

    def drop_dissimilar_edges(self, features, adj, metric='similarity'):
        """Drop dissimilar edges. (Faster version using numba)

        Parameters
        ----------
        features :
            node features (scipy sparse, numpy array, or torch.Tensor)
        adj :
            adjacency matrix (scipy sparse, numpy array, or torch.Tensor)
        metric : str
            'distance' to drop by Euclidean distance; anything else drops by
            Jaccard (binary features) or cosine similarity.

        Returns
        -------
        scipy.sparse matrix
            symmetric adjacency with dissimilar edges removed.
        """
        if not sp.issparse(adj):
            # Bug fix: `.to('cpu')` only exists on torch tensors; a plain
            # numpy adjacency would previously crash here.
            if isinstance(adj, torch.Tensor):
                adj = adj.to('cpu')
            adj = sp.csr_matrix(adj)

        # Work on the upper triangle so each undirected edge is visited once.
        adj_triu = sp.triu(adj, format='csr')

        # The njit kernels require dense numpy feature arrays.
        if sp.issparse(features):
            features = features.todense().A  # make it easier for njit processing
        elif isinstance(features, torch.Tensor):
            features = features.detach().cpu().numpy()

        if metric == 'distance':
            removed_cnt = dropedge_dis(adj_triu.data, adj_triu.indptr, adj_triu.indices, features, threshold=self.threshold)
        else:
            if self.binary_feature:
                removed_cnt = dropedge_jaccard(adj_triu.data, adj_triu.indptr, adj_triu.indices, features, threshold=self.threshold)
            else:
                removed_cnt = dropedge_cosine(adj_triu.data, adj_triu.indptr, adj_triu.indices, features, threshold=self.threshold)
        print('removed %s edges in the original graph' % removed_cnt)

        # Symmetrize back to a full adjacency.
        modified_adj = adj_triu + adj_triu.transpose()
        return modified_adj

    def predict(self, features=None, adj=None):
        """By default, the inputs should be unnormalized adjacency

        Parameters
        ----------
        features :
            node features. If `features` and `adj` are not given, this function will use previous stored `features` and `adj` from training to make predictions.
        adj :
            adjcency matrix. If `features` and `adj` are not given, this function will use previous stored `features` and `adj` from training to make predictions.

        Returns
        -------
        torch.FloatTensor
            output (log probabilities) of GCNJaccard
        """
        self.eval()
        if features is None and adj is None:
            return self.forward(self.features, self.adj_norm)
        else:
            adj = self.drop_dissimilar_edges(features, adj)
            if type(adj) is not torch.Tensor:
                features, adj = utils.to_tensor(features, adj, device=self.device)
            self.features = features
            if utils.is_sparse_tensor(adj):
                self.adj_norm = utils.normalize_adj_tensor(adj, sparse=True)
            else:
                self.adj_norm = utils.normalize_adj_tensor(adj)
            return self.forward(self.features, self.adj_norm)

    def _drop_dissimilar_edges(self, features, adj):
        """Drop dissimilar edges. (Slower pure-Python version, kept for
        reference; iterates every edge and zeroes it when the endpoint
        similarity falls below ``self.threshold``.)
        """
        if not sp.issparse(adj):
            adj = sp.csr_matrix(adj)
        modified_adj = adj.copy().tolil()

        # preprocessing based on features
        print('=== GCN-Jaccard ===')
        edges = np.array(modified_adj.nonzero()).T
        removed_cnt = 0
        for edge in tqdm(edges):
            n1 = edge[0]
            n2 = edge[1]
            # Only handle each undirected edge once (upper triangle).
            if n1 > n2:
                continue

            if self.binary_feature:
                J = self._jaccard_similarity(features[n1], features[n2])
                if J < self.threshold:
                    modified_adj[n1, n2] = 0
                    modified_adj[n2, n1] = 0
                    removed_cnt += 1
            else:
                # For not binary feature, use cosine similarity
                C = self._cosine_similarity(features[n1], features[n2])
                if C < self.threshold:
                    modified_adj[n1, n2] = 0
                    modified_adj[n2, n1] = 0
                    removed_cnt += 1
        print('removed %s edges in the original graph' % removed_cnt)
        return modified_adj

    def _jaccard_similarity(self, a, b):
        """Jaccard similarity of two sparse binary feature rows."""
        intersection = a.multiply(b).count_nonzero()
        union = a.count_nonzero() + b.count_nonzero() - intersection
        # Bug fix: guard against division by zero when both rows are all-zero,
        # consistent with the safe njit dropedge_jaccard kernel below.
        if union == 0:
            return 0
        return intersection * 1.0 / union

    def _cosine_similarity(self, a, b):
        """Cosine similarity of two dense feature rows (epsilon-stabilized)."""
        inner_product = (a * b).sum()
        C = inner_product / (np.sqrt(np.square(a).sum()) * np.sqrt(np.square(b).sum()) + 1e-10)
        return C
def __dropedge_jaccard(A, iA, jA, features, threshold):
    """Deprecated sparse-feature variant of :func:`dropedge_jaccard`.

    Walks a CSR adjacency (``A`` data, ``iA`` indptr, ``jA`` indices) and
    zeroes out edges whose endpoint feature rows have Jaccard similarity
    below ``threshold``. ``features`` rows must support ``.multiply`` and
    ``.count_nonzero`` (scipy sparse). Returns the number of removed edges.
    """
    removed_cnt = 0
    for row in range(len(iA) - 1):
        for i in range(iA[row], iA[row + 1]):
            n1 = row
            n2 = jA[i]
            a, b = features[n1], features[n2]
            intersection = a.multiply(b).count_nonzero()
            # Bug fix: guard against division by zero when both rows are
            # all-zero (union == 0), matching the safe njit version.
            union = a.count_nonzero() + b.count_nonzero() - intersection
            if union == 0:
                J = 0
            else:
                J = intersection * 1.0 / union
            if J < threshold:
                A[i] = 0
                removed_cnt += 1
    return removed_cnt
@njit
def dropedge_jaccard(A, iA, jA, features, threshold):
    """Zero out CSR edges whose endpoints have Jaccard similarity below
    ``threshold``; return how many edges were removed.

    ``A``/``iA``/``jA`` are the data/indptr/indices arrays of a CSR matrix;
    ``features`` is a dense numpy array of node features.
    """
    removed_cnt = 0
    n_rows = len(iA) - 1
    for n1 in range(n_rows):
        for idx in range(iA[n1], iA[n1 + 1]):
            n2 = jA[idx]
            a = features[n1]
            b = features[n2]
            both = np.count_nonzero(a * b)
            union = np.count_nonzero(a) + np.count_nonzero(b) - both
            # Two all-zero rows: defined as zero similarity (no division).
            if union == 0:
                print("Running safe dropedge_jaccard")
                score = 0.0
            else:
                score = both * 1.0 / union
            if score < threshold:
                A[idx] = 0
                removed_cnt += 1
    return removed_cnt
@njit
def dropedge_cosine(A, iA, jA, features, threshold):
    """Zero out CSR edges whose endpoints have cosine similarity below
    ``threshold``; return how many edges were removed.

    ``A``/``iA``/``jA`` are the data/indptr/indices arrays of a CSR matrix;
    ``features`` is a dense numpy array of node features.
    """
    removed_cnt = 0
    for n1 in range(len(iA) - 1):
        for idx in range(iA[n1], iA[n1 + 1]):
            n2 = jA[idx]
            a = features[n1]
            b = features[n2]
            dot = (a * b).sum()
            # Epsilon keeps the denominator nonzero for all-zero rows.
            denom = np.sqrt(np.square(a).sum()) * np.sqrt(np.square(b).sum()) + 1e-8
            if dot / denom < threshold:
                A[idx] = 0
                removed_cnt += 1
    return removed_cnt
@njit
def dropedge_dis(A, iA, jA, features, threshold):
    """Zero out CSR edges whose endpoints are farther apart (Euclidean
    distance) than ``threshold``; return how many edges were removed.

    ``A``/``iA``/``jA`` are the data/indptr/indices arrays of a CSR matrix;
    ``features`` is a dense numpy array of node features.
    """
    removed_cnt = 0
    for n1 in range(len(iA) - 1):
        for idx in range(iA[n1], iA[n1 + 1]):
            n2 = jA[idx]
            dist = np.linalg.norm(features[n1] - features[n2])
            if dist > threshold:
                A[idx] = 0
                removed_cnt += 1
    return removed_cnt
@njit
def dropedge_both(A, iA, jA, features, threshold1=2.5, threshold2=0.01):
    """Zero out CSR edges failing either a distance or a cosine test.

    An edge (n1, n2) is removed when the Euclidean distance between the
    endpoint features exceeds ``threshold1`` OR their cosine similarity is
    below ``threshold2``. Returns the number of removed edges.

    Bug fixes versus the previous version:
    - the condition tested ``threshold2 < 0`` (a per-call constant), so the
      similarity value C2 was computed but never used; it now compares C2
      against ``threshold2``;
    - the denominator was ``sqrt(||a||^2 + ||b||^2)``, which is not a cosine
      normalizer; it now uses the product of the norms, matching
      ``dropedge_cosine``.
    """
    removed_cnt = 0
    for row in range(len(iA) - 1):
        for i in range(iA[row], iA[row + 1]):
            n1 = row
            n2 = jA[i]
            a, b = features[n1], features[n2]
            C1 = np.linalg.norm(a - b)
            inner_product = (a * b).sum()
            # Epsilon keeps the denominator nonzero for all-zero rows.
            C2 = inner_product / (np.sqrt(np.square(a).sum()) * np.sqrt(np.square(b).sum()) + 1e-8)
            if C1 > threshold1 or C2 < threshold2:
                A[i] = 0
                removed_cnt += 1
    return removed_cnt
if __name__ == "__main__":
    from deeprobust.graph.data import PrePtbDataset, Dataset

    # Load the clean graph and its train/val/test split.
    dataset_str = 'pubmed'
    clean_data = Dataset(root='/tmp/', name=dataset_str, seed=15)
    adj, features, labels = clean_data.adj, clean_data.features, clean_data.labels
    idx_train, idx_val, idx_test = (clean_data.idx_train,
                                    clean_data.idx_val,
                                    clean_data.idx_test)

    # Load the pre-attacked adjacency for the same dataset.
    perturbed_adj = PrePtbDataset(root='/tmp/', name=dataset_str).adj

    # GCNJaccard sanity check: predict() on cached training state must agree
    # with predict() on explicitly supplied inputs.
    print("Test GCNJaccard")
    jaccard_model = GCNJaccard(nfeat=features.shape[1],
                               nhid=16,
                               nclass=labels.max().item() + 1,
                               binary_feature=False,
                               dropout=0.5, device='cuda').to('cuda')
    jaccard_model.fit(features, perturbed_adj, labels, idx_train, idx_val, threshold=0.1)
    jaccard_model.test(idx_test)
    pred_cached = jaccard_model.predict()
    pred_fresh = jaccard_model.predict(features, perturbed_adj)
    assert (pred_cached != pred_fresh).sum() == 0

    # Same check for GCNSVD; the SVD reconstruction is float-heavy, so the
    # two passes are compared with a tolerance rather than exact equality.
    print("Test GCNSVD")
    svd_model = GCNSVD(nfeat=features.shape[1],
                       nhid=16,
                       nclass=labels.max().item() + 1,
                       dropout=0.5, device='cuda').to('cuda')
    svd_model.fit(features, perturbed_adj, labels, idx_train, idx_val, k=20)
    svd_model.test(idx_test)
    pred_cached = svd_model.predict()
    pred_fresh = svd_model.predict(features, perturbed_adj)
    assert (pred_cached - pred_fresh).mean() < 1e-5