# deeprobust/graph/defense/gcn_preprocess.py
# (uploaded via upload-large-folder tool, commit 64f5718)
import torch.nn as nn
import torch.nn.functional as F
import math
import torch
from torch.nn.parameter import Parameter
from torch.nn.modules.module import Module
from deeprobust.graph import utils
from deeprobust.graph.defense import GCN
from sklearn.utils.extmath import randomized_svd
from tqdm import tqdm
import scipy.sparse as sp
import numpy as np
from numba import njit
class GCNSVD(GCN):
    """GCNSVD is a 2 Layer Graph Convolutional Network with Truncated SVD as
    preprocessing. See more details in All You Need Is Low (Rank): Defending
    Against Adversarial Attacks on Graphs,
    https://dl.acm.org/doi/abs/10.1145/3336191.3371789.

    Parameters
    ----------
    nfeat : int
        size of input feature dimension
    nhid : int
        number of hidden units
    nclass : int
        size of output dimension
    dropout : float
        dropout rate for GCN
    lr : float
        learning rate for GCN
    weight_decay : float
        weight decay coefficient (l2 normalization) for GCN. When `with_relu` is True, `weight_decay` will be set to 0.
    with_relu : bool
        whether to use relu activation function. If False, GCN will be linearized.
    with_bias: bool
        whether to include bias term in GCN weights.
    device: str
        'cpu' or 'cuda'.

    Examples
    --------
    We can first load dataset and then train GCNSVD.

    >>> from deeprobust.graph.data import PrePtbDataset, Dataset
    >>> from deeprobust.graph.defense import GCNSVD
    >>> # load clean graph data
    >>> data = Dataset(root='/tmp/', name='cora', seed=15)
    >>> adj, features, labels = data.adj, data.features, data.labels
    >>> idx_train, idx_val, idx_test = data.idx_train, data.idx_val, data.idx_test
    >>> # load perturbed graph data
    >>> perturbed_data = PrePtbDataset(root='/tmp/', name='cora')
    >>> perturbed_adj = perturbed_data.adj
    >>> # train defense model
    >>> model = GCNSVD(nfeat=features.shape[1],
              nhid=16,
              nclass=labels.max().item() + 1,
              dropout=0.5, device='cpu').to('cpu')
    >>> model.fit(features, perturbed_adj, labels, idx_train, idx_val, k=20)
    """

    def __init__(self, nfeat, nhid, nclass, dropout=0.5, lr=0.01, weight_decay=5e-4, with_relu=True, with_bias=True, device='cpu'):
        super(GCNSVD, self).__init__(nfeat, nhid, nclass, dropout, lr, weight_decay, with_relu, with_bias, device=device)
        self.device = device
        # Rank used by the last call to fit(); predict() reuses it so a new
        # adjacency receives the same preprocessing as the training graph.
        self.k = None

    def fit(self, features, adj, labels, idx_train, idx_val=None, k=50, train_iters=200, initialize=True, verbose=True, **kwargs):
        """First perform rank-k approximation of adjacency matrix via
        truncated SVD, and then train the gcn model on the processed graph,
        when idx_val is not None, pick the best model according to
        the validation loss.

        Parameters
        ----------
        features :
            node features
        adj :
            the adjacency matrix. The format could be torch.tensor or scipy matrix
        labels :
            node labels
        idx_train :
            node training indices
        idx_val :
            node validation indices. If not given (None), GCN training process will not adpot early stopping
        k : int
            number of singular values and vectors to compute.
        train_iters : int
            number of training epochs
        initialize : bool
            whether to initialize parameters before training
        verbose : bool
            whether to show verbose logs
        """
        # Bug fix: the adjacency may be a scipy/numpy matrix (as documented
        # above), which has no `.to()` method; only move torch tensors to CPU.
        if isinstance(adj, torch.Tensor):
            adj = adj.to('cpu')
        modified_adj = self.truncatedSVD(adj, k=k)
        # Remember the rank so predict() can preprocess new graphs identically.
        self.k = k
        features, modified_adj, labels = utils.to_tensor(features, modified_adj, labels, device=self.device)
        self.modified_adj = modified_adj
        self.features = features
        self.labels = labels
        super().fit(features, modified_adj, labels, idx_train, idx_val, train_iters=train_iters, initialize=initialize, verbose=verbose)

    def truncatedSVD(self, data, k=50):
        """Return a dense rank-k approximation of ``data`` using sklearn's
        randomized SVD algorithm.

        Parameters
        ----------
        data :
            matrix to decompose; torch.Tensor, numpy array or scipy sparse
            matrix (randomized_svd accepts all of these once tensors are
            converted to numpy).
        k : int
            number of singular values and vectors to compute.

        Returns
        -------
        numpy.ndarray
            reconstructed rank-k matrix ``U @ diag(S) @ Vt``.
        """
        print(f'=== Fast SVD: rank={k} ===')
        # Convert torch Tensor to NumPy
        if isinstance(data, torch.Tensor):
            data = data.detach().cpu().numpy()
        # Bug fix: random_state was None, making the approximation
        # nondeterministic; repeated calls on the same matrix (e.g. the two
        # predict() passes in the script-level sanity check) now agree.
        U, S, Vt = randomized_svd(data, n_components=k, n_iter=10, random_state=0)
        print("rank_after =", np.count_nonzero(S))
        diag_S = np.diag(S)
        return U @ diag_S @ Vt

    def predict(self, features=None, adj=None):
        """By default, the inputs should be unnormalized adjacency

        Parameters
        ----------
        features :
            node features. If `features` and `adj` are not given, this function will use previous stored `features` and `adj` from training to make predictions.
        adj :
            adjcency matrix. If `features` and `adj` are not given, this function will use previous stored `features` and `adj` from training to make predictions.

        Returns
        -------
        torch.FloatTensor
            output (log probabilities) of GCNSVD
        """
        self.eval()
        if features is None and adj is None:
            return self.forward(self.features, self.adj_norm)
        else:
            # Robustness fix: fall back to the fit() default rank when
            # predict() is called before fit() (self.k would be None).
            k = self.k if self.k is not None else 50
            adj = self.truncatedSVD(adj, k=k)
            # truncatedSVD returns a numpy array, so this branch is taken and
            # both inputs are converted to tensors on the model device.
            if type(adj) is not torch.Tensor:
                features, adj = utils.to_tensor(features, adj, device=self.device)
            self.features = features
            if utils.is_sparse_tensor(adj):
                self.adj_norm = utils.normalize_adj_tensor(adj, sparse=True)
            else:
                self.adj_norm = utils.normalize_adj_tensor(adj)
            return self.forward(self.features, self.adj_norm)
class GCNJaccard(GCN):
    """GCNJaccard first preprocesses input graph via droppining dissimilar
    edges and train a GCN based on the processed graph. See more details in
    Adversarial Examples on Graph Data: Deep Insights into Attack and Defense,
    https://arxiv.org/pdf/1903.01610.pdf.

    Parameters
    ----------
    nfeat : int
        size of input feature dimension
    nhid : int
        number of hidden units
    nclass : int
        size of output dimension
    dropout : float
        dropout rate for GCN
    lr : float
        learning rate for GCN
    weight_decay : float
        weight decay coefficient (l2 normalization) for GCN. When `with_relu` is True, `weight_decay` will be set to 0.
    with_relu : bool
        whether to use relu activation function. If False, GCN will be linearized.
    with_bias: bool
        whether to include bias term in GCN weights.
    device: str
        'cpu' or 'cuda'.

    Examples
    --------
    We can first load dataset and then train GCNJaccard.

    >>> from deeprobust.graph.data import PrePtbDataset, Dataset
    >>> from deeprobust.graph.defense import GCNJaccard
    >>> # load clean graph data
    >>> data = Dataset(root='/tmp/', name='cora', seed=15)
    >>> adj, features, labels = data.adj, data.features, data.labels
    >>> idx_train, idx_val, idx_test = data.idx_train, data.idx_val, data.idx_test
    >>> # load perturbed graph data
    >>> perturbed_data = PrePtbDataset(root='/tmp/', name='cora')
    >>> perturbed_adj = perturbed_data.adj
    >>> # train defense model
    >>> model = GCNJaccard(nfeat=features.shape[1],
              nhid=16,
              nclass=labels.max().item() + 1,
              dropout=0.5, device='cpu').to('cpu')
    >>> model.fit(features, perturbed_adj, labels, idx_train, idx_val, threshold=0.03)
    """

    def __init__(self, nfeat, nhid, nclass, binary_feature=True, dropout=0.5, lr=0.01, weight_decay=5e-4, with_relu=True, with_bias=True, device='cpu'):
        super(GCNJaccard, self).__init__(nfeat, nhid, nclass, dropout, lr, weight_decay, with_relu, with_bias, device=device)
        self.device = device
        # When True, edges are filtered by Jaccard similarity of binary
        # features; otherwise cosine similarity is used.
        self.binary_feature = binary_feature

    def fit(self, features, adj, labels, idx_train, idx_val=None, threshold=0.01, train_iters=200, initialize=True, verbose=True, **kwargs):
        """First drop dissimilar edges with similarity smaller than given
        threshold and then train the gcn model on the processed graph.
        When idx_val is not None, pick the best model according to the
        validation loss.

        Parameters
        ----------
        features :
            node features. The format can be numpy.array or scipy matrix
        adj :
            the adjacency matrix.
        labels :
            node labels
        idx_train :
            node training indices
        idx_val :
            node validation indices. If not given (None), GCN training process will not adpot early stopping
        threshold : float
            similarity threshold for dropping edges. If two connected nodes with similarity smaller than threshold, the edge between them will be removed.
        train_iters : int
            number of training epochs
        initialize : bool
            whether to initialize parameters before training
        verbose : bool
            whether to show verbose logs
        """
        self.threshold = threshold
        modified_adj = self.drop_dissimilar_edges(features, adj)
        features, modified_adj, labels = utils.to_tensor(features, modified_adj, labels, device=self.device)
        self.modified_adj = modified_adj
        self.features = features
        self.labels = labels
        super().fit(features, modified_adj, labels, idx_train, idx_val, train_iters=train_iters, initialize=initialize, verbose=verbose)

    def drop_dissimilar_edges(self, features, adj, metric='similarity'):
        """Drop dissimilar edges. (Faster version using numba)

        Parameters
        ----------
        features :
            node features (scipy sparse, numpy array, or torch.Tensor)
        adj :
            adjacency matrix (scipy sparse, numpy array, or torch.Tensor)
        metric : str
            'distance' to drop by Euclidean distance; anything else drops by
            Jaccard (binary features) or cosine similarity.

        Returns
        -------
        scipy.sparse matrix
            symmetric adjacency with dissimilar edges removed.
        """
        if not sp.issparse(adj):
            # Bug fix: `.to('cpu')` only exists on torch tensors; a plain
            # numpy adjacency would previously crash here.
            if isinstance(adj, torch.Tensor):
                adj = adj.to('cpu')
            adj = sp.csr_matrix(adj)

        # Work on the upper triangle so each undirected edge is visited once.
        adj_triu = sp.triu(adj, format='csr')

        # The njit kernels require dense numpy feature arrays.
        if sp.issparse(features):
            features = features.todense().A  # make it easier for njit processing
        elif isinstance(features, torch.Tensor):
            features = features.detach().cpu().numpy()

        if metric == 'distance':
            removed_cnt = dropedge_dis(adj_triu.data, adj_triu.indptr, adj_triu.indices, features, threshold=self.threshold)
        else:
            if self.binary_feature:
                removed_cnt = dropedge_jaccard(adj_triu.data, adj_triu.indptr, adj_triu.indices, features, threshold=self.threshold)
            else:
                removed_cnt = dropedge_cosine(adj_triu.data, adj_triu.indptr, adj_triu.indices, features, threshold=self.threshold)
        print('removed %s edges in the original graph' % removed_cnt)

        # Symmetrize back to a full adjacency.
        modified_adj = adj_triu + adj_triu.transpose()
        return modified_adj

    def predict(self, features=None, adj=None):
        """By default, the inputs should be unnormalized adjacency

        Parameters
        ----------
        features :
            node features. If `features` and `adj` are not given, this function will use previous stored `features` and `adj` from training to make predictions.
        adj :
            adjcency matrix. If `features` and `adj` are not given, this function will use previous stored `features` and `adj` from training to make predictions.

        Returns
        -------
        torch.FloatTensor
            output (log probabilities) of GCNJaccard
        """
        self.eval()
        if features is None and adj is None:
            return self.forward(self.features, self.adj_norm)
        else:
            adj = self.drop_dissimilar_edges(features, adj)
            if type(adj) is not torch.Tensor:
                features, adj = utils.to_tensor(features, adj, device=self.device)
            self.features = features
            if utils.is_sparse_tensor(adj):
                self.adj_norm = utils.normalize_adj_tensor(adj, sparse=True)
            else:
                self.adj_norm = utils.normalize_adj_tensor(adj)
            return self.forward(self.features, self.adj_norm)

    def _drop_dissimilar_edges(self, features, adj):
        """Drop dissimilar edges. (Slower pure-Python version, kept for
        reference; iterates every edge and zeroes it when the endpoint
        similarity falls below ``self.threshold``.)
        """
        if not sp.issparse(adj):
            adj = sp.csr_matrix(adj)
        modified_adj = adj.copy().tolil()

        # preprocessing based on features
        print('=== GCN-Jaccard ===')
        edges = np.array(modified_adj.nonzero()).T
        removed_cnt = 0
        for edge in tqdm(edges):
            n1 = edge[0]
            n2 = edge[1]
            # Only handle each undirected edge once (upper triangle).
            if n1 > n2:
                continue

            if self.binary_feature:
                J = self._jaccard_similarity(features[n1], features[n2])
                if J < self.threshold:
                    modified_adj[n1, n2] = 0
                    modified_adj[n2, n1] = 0
                    removed_cnt += 1
            else:
                # For not binary feature, use cosine similarity
                C = self._cosine_similarity(features[n1], features[n2])
                if C < self.threshold:
                    modified_adj[n1, n2] = 0
                    modified_adj[n2, n1] = 0
                    removed_cnt += 1
        print('removed %s edges in the original graph' % removed_cnt)
        return modified_adj

    def _jaccard_similarity(self, a, b):
        """Jaccard similarity of two sparse binary feature rows."""
        intersection = a.multiply(b).count_nonzero()
        union = a.count_nonzero() + b.count_nonzero() - intersection
        # Bug fix: guard against division by zero when both rows are all-zero,
        # consistent with the safe njit dropedge_jaccard kernel below.
        if union == 0:
            return 0
        return intersection * 1.0 / union

    def _cosine_similarity(self, a, b):
        """Cosine similarity of two dense feature rows (epsilon-stabilized)."""
        inner_product = (a * b).sum()
        C = inner_product / (np.sqrt(np.square(a).sum()) * np.sqrt(np.square(b).sum()) + 1e-10)
        return C
def __dropedge_jaccard(A, iA, jA, features, threshold):
    """Deprecated sparse-feature variant of :func:`dropedge_jaccard`.

    Walks a CSR adjacency (``A`` data, ``iA`` indptr, ``jA`` indices) and
    zeroes out edges whose endpoint feature rows have Jaccard similarity
    below ``threshold``. ``features`` rows must support ``.multiply`` and
    ``.count_nonzero`` (scipy sparse). Returns the number of removed edges.
    """
    removed_cnt = 0
    for row in range(len(iA) - 1):
        for i in range(iA[row], iA[row + 1]):
            n1 = row
            n2 = jA[i]
            a, b = features[n1], features[n2]
            intersection = a.multiply(b).count_nonzero()
            # Bug fix: guard against division by zero when both rows are
            # all-zero (union == 0), matching the safe njit version.
            union = a.count_nonzero() + b.count_nonzero() - intersection
            if union == 0:
                J = 0
            else:
                J = intersection * 1.0 / union
            if J < threshold:
                A[i] = 0
                removed_cnt += 1
    return removed_cnt
@njit
def dropedge_jaccard(A, iA, jA, features, threshold):
    """Zero out CSR edges whose endpoints have Jaccard similarity below
    ``threshold``; return how many edges were removed.

    ``A``/``iA``/``jA`` are the data/indptr/indices arrays of a CSR matrix;
    ``features`` is a dense numpy array of node features.
    """
    removed_cnt = 0
    n_rows = len(iA) - 1
    for n1 in range(n_rows):
        for idx in range(iA[n1], iA[n1 + 1]):
            n2 = jA[idx]
            a = features[n1]
            b = features[n2]
            both = np.count_nonzero(a * b)
            union = np.count_nonzero(a) + np.count_nonzero(b) - both
            # Two all-zero rows: defined as zero similarity (no division).
            if union == 0:
                print("Running safe dropedge_jaccard")
                score = 0.0
            else:
                score = both * 1.0 / union
            if score < threshold:
                A[idx] = 0
                removed_cnt += 1
    return removed_cnt
@njit
def dropedge_cosine(A, iA, jA, features, threshold):
    """Zero out CSR edges whose endpoints have cosine similarity below
    ``threshold``; return how many edges were removed.

    ``A``/``iA``/``jA`` are the data/indptr/indices arrays of a CSR matrix;
    ``features`` is a dense numpy array of node features.
    """
    removed_cnt = 0
    for n1 in range(len(iA) - 1):
        for idx in range(iA[n1], iA[n1 + 1]):
            n2 = jA[idx]
            a = features[n1]
            b = features[n2]
            dot = (a * b).sum()
            # Epsilon keeps the denominator nonzero for all-zero rows.
            denom = np.sqrt(np.square(a).sum()) * np.sqrt(np.square(b).sum()) + 1e-8
            if dot / denom < threshold:
                A[idx] = 0
                removed_cnt += 1
    return removed_cnt
@njit
def dropedge_dis(A, iA, jA, features, threshold):
    """Zero out CSR edges whose endpoints are farther apart (Euclidean
    distance) than ``threshold``; return how many edges were removed.

    ``A``/``iA``/``jA`` are the data/indptr/indices arrays of a CSR matrix;
    ``features`` is a dense numpy array of node features.
    """
    removed_cnt = 0
    for n1 in range(len(iA) - 1):
        for idx in range(iA[n1], iA[n1 + 1]):
            n2 = jA[idx]
            dist = np.linalg.norm(features[n1] - features[n2])
            if dist > threshold:
                A[idx] = 0
                removed_cnt += 1
    return removed_cnt
@njit
def dropedge_both(A, iA, jA, features, threshold1=2.5, threshold2=0.01):
    """Zero out CSR edges failing either a distance or a cosine test.

    An edge (n1, n2) is removed when the Euclidean distance between the
    endpoint features exceeds ``threshold1`` OR their cosine similarity is
    below ``threshold2``. Returns the number of removed edges.

    Bug fixes versus the previous version:
    - the condition tested ``threshold2 < 0`` (a per-call constant), so the
      similarity value C2 was computed but never used; it now compares C2
      against ``threshold2``;
    - the denominator was ``sqrt(||a||^2 + ||b||^2)``, which is not a cosine
      normalizer; it now uses the product of the norms, matching
      ``dropedge_cosine``.
    """
    removed_cnt = 0
    for row in range(len(iA) - 1):
        for i in range(iA[row], iA[row + 1]):
            n1 = row
            n2 = jA[i]
            a, b = features[n1], features[n2]
            C1 = np.linalg.norm(a - b)
            inner_product = (a * b).sum()
            # Epsilon keeps the denominator nonzero for all-zero rows.
            C2 = inner_product / (np.sqrt(np.square(a).sum()) * np.sqrt(np.square(b).sum()) + 1e-8)
            if C1 > threshold1 or C2 < threshold2:
                A[i] = 0
                removed_cnt += 1
    return removed_cnt
if __name__ == "__main__":
    from deeprobust.graph.data import PrePtbDataset, Dataset

    # Load the clean graph and its train/val/test split.
    dataset_str = 'pubmed'
    clean_data = Dataset(root='/tmp/', name=dataset_str, seed=15)
    adj, features, labels = clean_data.adj, clean_data.features, clean_data.labels
    idx_train, idx_val, idx_test = (clean_data.idx_train,
                                    clean_data.idx_val,
                                    clean_data.idx_test)

    # Load the pre-attacked adjacency for the same dataset.
    perturbed_adj = PrePtbDataset(root='/tmp/', name=dataset_str).adj

    # GCNJaccard sanity check: predict() on cached training state must agree
    # with predict() on explicitly supplied inputs.
    print("Test GCNJaccard")
    jaccard_model = GCNJaccard(nfeat=features.shape[1],
                               nhid=16,
                               nclass=labels.max().item() + 1,
                               binary_feature=False,
                               dropout=0.5, device='cuda').to('cuda')
    jaccard_model.fit(features, perturbed_adj, labels, idx_train, idx_val, threshold=0.1)
    jaccard_model.test(idx_test)
    pred_cached = jaccard_model.predict()
    pred_fresh = jaccard_model.predict(features, perturbed_adj)
    assert (pred_cached != pred_fresh).sum() == 0

    # Same check for GCNSVD; the SVD reconstruction is float-heavy, so the
    # two passes are compared with a tolerance rather than exact equality.
    print("Test GCNSVD")
    svd_model = GCNSVD(nfeat=features.shape[1],
                       nhid=16,
                       nclass=labels.max().item() + 1,
                       dropout=0.5, device='cuda').to('cuda')
    svd_model.fit(features, perturbed_adj, labels, idx_train, idx_val, k=20)
    svd_model.test(idx_test)
    pred_cached = svd_model.predict()
    pred_fresh = svd_model.predict(features, perturbed_adj)
    assert (pred_cached - pred_fresh).mean() < 1e-5