# clique / GCond / coreset / all_methods.py
# qingy2024's picture
# Upload folder using huggingface_hub
# f74dd01 verified
import torch
import numpy as np
import json
class Base:
    """Shared state for the per-class coreset selectors defined in this file.

    The constructor derives the size of the reduced set from
    ``args.reduction_rate`` and builds the per-class quotas
    (``self.num_class_dict``) that subclasses read in their ``select``.
    """

    def __init__(self, data, args, device='cuda', **kwargs):
        """Store the inputs and precompute the synthetic label tensor.

        Args:
            data: object exposing ``feat_train`` (only ``shape[0]`` is read
                here) and ``labels_train``.
            args: object exposing ``reduction_rate``.
            device: device the synthetic label tensor is moved to.
            **kwargs: accepted for signature compatibility; not used here.
        """
        self.data = data
        self.args = args
        self.device = device
        self.nnodes_syn = int(data.feat_train.shape[0] * args.reduction_rate)
        self.labels_syn = torch.LongTensor(self.generate_labels_syn(data)).to(device)

    def generate_labels_syn(self, data):
        """Build the label list of the reduced set, one quota per class.

        Classes are visited from least to most frequent. Every class except
        the last receives ``max(int(count * reduction_rate), 1)`` slots; the
        last (most frequent) class receives whatever is left of the overall
        budget ``int(n * reduction_rate)``.

        Side effects:
            Sets ``self.syn_class_indices`` (label -> ``[start, end]``
            offsets into the returned list) and ``self.num_class_dict``
            (label -> quota).

        Returns:
            list: the labels, grouped by class in visiting order.
        """
        from collections import Counter

        rate = self.args.reduction_rate
        budget = int(len(data.labels_train) * rate)
        # The sort is stable, so equally frequent classes keep the order in
        # which they first appear in labels_train.
        by_frequency = sorted(Counter(data.labels_train).items(), key=lambda item: item[1])
        last_position = len(by_frequency) - 1

        num_class_dict = {}
        labels_syn = []
        allocated = 0
        self.syn_class_indices = {}
        for position, (label, count) in enumerate(by_frequency):
            if position == last_position:
                # The one-slot minimum granted to the smaller classes can
                # overspend the budget; clamp so the remainder never becomes
                # a negative quota (which would give an inverted index range
                # here and negative slice bounds in the selectors).
                quota = max(budget - allocated, 0)
            else:
                quota = max(int(count * rate), 1)
                allocated += quota
            start = len(labels_syn)
            num_class_dict[label] = quota
            self.syn_class_indices[label] = [start, start + quota]
            labels_syn += [label] * quota

        self.num_class_dict = num_class_dict
        return labels_syn

    def select(self):
        """Placeholder overridden by the concrete selectors; returns None."""
        return
class KCenter(Base):
    """Greedy k-center selection, run independently for every class."""

    def __init__(self, data, args, device='cuda', **kwargs):
        """Initialise the shared quotas; see ``Base.__init__`` for arguments."""
        # Forward the caller's device instead of a hard-coded 'cuda'.
        super().__init__(data, args, device=device, **kwargs)

    def select(self, embeds, inductive=False):
        """Pick ``num_class_dict[c]`` training nodes for each class ``c``.

        For each class the first center is the node closest to the class
        mean; every further center is the node whose distance to its nearest
        already-chosen center is largest.

        Args:
            embeds: torch tensor of embeddings, indexed by the training
                indices used below.
            inductive: when True, nodes are addressed by their position
                ``0..len(data.idx_train)-1`` instead of by
                ``data.idx_train`` itself.

        Returns:
            numpy.ndarray: the selected indices of all classes, concatenated
            in ``num_class_dict`` order.
        """
        if inductive:
            idx_train = np.arange(len(self.data.idx_train))
        else:
            idx_train = self.data.idx_train
        labels_train = self.data.labels_train

        idx_selected = []
        for class_id, cnt in self.num_class_dict.items():
            idx = idx_train[labels_train == class_id]
            feature = embeds[idx]

            # Seed with the node nearest to the class mean.
            # NOTE: this seed is taken unconditionally, so a quota <= 0
            # still yields one node for the class.
            mean = torch.mean(feature, dim=0, keepdim=True)
            dis = torch.cdist(feature, mean)[:, 0]
            idx_centers = torch.argsort(dis)[:1].tolist()

            for _ in range(cnt - 1):
                # Distance of every node to its closest chosen center; the
                # node maximising it becomes the next center.
                dis_center = torch.cdist(feature, feature[idx_centers])
                dis_min, _ = torch.min(dis_center, dim=-1)
                idx_centers.append(torch.argmax(dis_min).item())

            idx_selected.append(idx[idx_centers])
        return np.hstack(idx_selected)
class Herding(Base):
    """Herding selection, run independently for every class."""

    def __init__(self, data, args, device='cuda', **kwargs):
        """Initialise the shared quotas; see ``Base.__init__`` for arguments."""
        # Forward the caller's device instead of a hard-coded 'cuda'.
        super().__init__(data, args, device=device, **kwargs)

    def select(self, embeds, inductive=False):
        """Pick ``num_class_dict[c]`` training nodes for each class ``c``.

        At step ``i`` the node chosen is the not-yet-selected one closest to
        ``mean * (i + 1) - sum(selected features)``, i.e. the one that keeps
        the running sum of selected features nearest to ``(i + 1)`` times the
        class mean.

        Args:
            embeds: torch tensor of embeddings, indexed by the training
                indices used below.
            inductive: when True, nodes are addressed by their position
                ``0..len(data.idx_train)-1`` instead of by
                ``data.idx_train`` itself.

        Returns:
            numpy.ndarray: the selected indices of all classes, concatenated
            in ``num_class_dict`` order.
        """
        if inductive:
            idx_train = np.arange(len(self.data.idx_train))
        else:
            idx_train = self.data.idx_train
        labels_train = self.data.labels_train

        idx_selected = []
        for class_id, cnt in self.num_class_dict.items():
            idx = idx_train[labels_train == class_id]
            features = embeds[idx]
            mean = torch.mean(features, dim=0, keepdim=True)

            selected = []
            # Positions (within this class) that are still available.
            idx_left = list(range(features.shape[0]))
            for step in range(cnt):
                target = mean * (step + 1) - torch.sum(features[selected], dim=0)
                dis = torch.cdist(target, features[idx_left])
                # dis has a single row, so the flat argmin is a position in
                # idx_left.
                nearest = torch.argmin(dis).item()
                selected.append(idx_left.pop(nearest))

            idx_selected.append(idx[selected])
        return np.hstack(idx_selected)
class Random(Base):
    """Uniform random selection, run independently for every class."""

    def __init__(self, data, args, device='cuda', **kwargs):
        """Initialise the shared quotas; see ``Base.__init__`` for arguments."""
        # Forward the caller's device instead of a hard-coded 'cuda'.
        super().__init__(data, args, device=device, **kwargs)

    def select(self, embeds, inductive=False):
        """Pick ``num_class_dict[c]`` random training nodes for each class.

        Args:
            embeds: not used by this selector; present so all selectors share
                one call signature.
            inductive: when True, nodes are addressed by their position
                ``0..len(data.idx_train)-1`` instead of by
                ``data.idx_train`` itself.

        Returns:
            numpy.ndarray: the selected indices of all classes, concatenated
            in ``num_class_dict`` order.
        """
        if inductive:
            idx_train = np.arange(len(self.data.idx_train))
        else:
            idx_train = self.data.idx_train
        labels_train = self.data.labels_train

        idx_selected = []
        for class_id, cnt in self.num_class_dict.items():
            idx = idx_train[labels_train == class_id]
            shuffled = np.random.permutation(idx)
            # A negative quota would otherwise act as a from-the-end slice
            # bound and return almost the whole class.
            idx_selected.append(shuffled[:max(cnt, 0)])
        return np.hstack(idx_selected)
class LRMC(Base):
    """
    Coreset selection using precomputed seed nodes from the
    Laplacian-Integrated Relaxed Maximal Clique (L-RMC) algorithm.

    Seed nodes are read from the JSON file named by ``args.lrmc_seeds_path``
    and are preferred when selecting training examples. Per-class quotas from
    ``num_class_dict`` are respected: if a class has fewer seeds than its
    quota, random training nodes of that class fill the remainder.
    """

    def __init__(self, data, args, device='cuda', **kwargs):
        """Initialise the shared quotas and load the seed nodes.

        Raises:
            ValueError: if ``args`` has no ``lrmc_seeds_path`` (or it is
                None), or if the seeds file lists no clusters.
        """
        super(LRMC, self).__init__(data, args, device=device, **kwargs)
        seeds_path = getattr(args, 'lrmc_seeds_path', None)
        if seeds_path is None:
            raise ValueError(
                "LRMC method selected but no path to seed file provided. "
                "Please specify --lrmc_seeds_path when running the training script."
            )
        self.seed_nodes = self._load_seed_nodes(seeds_path)

    def _load_seed_nodes(self, path: str):
        """Return the sorted, de-duplicated seed indices of the largest cluster.

        The file must be a JSON object with a ``clusters`` list; each cluster
        lists its nodes under ``seed_nodes`` or, failing that, ``members``.
        Only the cluster with the most nodes is used.

        Raises:
            ValueError: if ``clusters`` is missing or empty.
        """
        with open(path, 'r', encoding='utf-8') as f:
            js = json.load(f)
        clusters = js.get('clusters', [])
        if not clusters:
            raise ValueError(f"No clusters found in L‑RMC seeds file {path}")

        def _cluster_nodes(cluster):
            return cluster.get('seed_nodes') or cluster.get('members') or []

        nodes = _cluster_nodes(max(clusters, key=lambda c: len(_cluster_nodes(c))))

        seed_nodes = set()
        for raw in nodes:
            try:
                uid = int(raw)
            except (TypeError, ValueError, OverflowError):
                # Best effort: entries that are not integer-like are skipped.
                continue
            if uid < 0:
                continue
            # Ids >= 1 are shifted down by one; id 0 is kept as 0, so ids 0
            # and 1 both map to index 0.
            # NOTE(review): presumably the file is 1-based -- confirm against
            # whatever writes it.
            seed_nodes.add(max(uid - 1, 0))
        return sorted(seed_nodes)

    def select(self, embeds, inductive=False):
        """Pick ``num_class_dict[c]`` training nodes per class, seeds first.

        Args:
            embeds: not used by this selector; present so all selectors share
                one call signature.
            inductive: when True, nodes are addressed by their position
                ``0..len(data.idx_train)-1`` instead of by
                ``data.idx_train`` itself.
                NOTE(review): seed ids are matched against these same values,
                so in inductive mode the seeds must be expressed in that
                positional index space -- confirm with the caller.

        Returns:
            numpy.ndarray: int64 indices of all classes, concatenated in
            ``num_class_dict`` order.
        """
        labels_train = self.data.labels_train
        if inductive:
            idx_train = np.arange(len(self.data.idx_train))
        else:
            idx_train = self.data.idx_train

        seed_set = set(self.seed_nodes)
        idx_selected = []
        for class_id, cnt in self.num_class_dict.items():
            # A negative quota would otherwise act as a from-the-end slice
            # bound and select nodes for a class that should get none.
            quota = max(cnt, 0)
            class_indices = idx_train[labels_train == class_id]

            # Seeds of this class, in training order, capped at the quota.
            selected = [u for u in class_indices if u in seed_set][:quota]

            shortfall = quota - len(selected)
            if shortfall > 0:
                taken = set(selected)
                candidates = [u for u in class_indices if u not in taken]
                if len(candidates) <= shortfall:
                    selected += candidates
                else:
                    selected += np.random.choice(candidates, shortfall, replace=False).tolist()

            # Explicit integer dtype: an empty list would otherwise become a
            # float64 array and promote the whole stacked result to float.
            idx_selected.append(np.asarray(selected, dtype=np.int64))
        return np.hstack(idx_selected)