|
|
import torch |
|
|
import numpy as np |
|
|
import json |
|
|
|
|
|
|
|
|
class Base:
    """Base class for per-class coreset/condensation selection.

    Derives the size of the reduced set from ``args.reduction_rate`` and
    builds per-class quotas over the training labels; subclasses implement
    the actual ``select`` strategy.
    """

    def __init__(self, data, args, device='cuda', **kwargs):
        """
        Args:
            data: dataset object exposing ``feat_train`` (num_train x dim)
                and ``labels_train`` (iterable of integer class labels).
            args: namespace with at least ``reduction_rate`` in (0, 1].
            device: torch device for the synthetic label tensor.
        """
        self.data = data
        self.args = args
        self.device = device
        # Target number of nodes in the reduced set.
        n = int(data.feat_train.shape[0] * args.reduction_rate)
        self.nnodes_syn = n
        self.labels_syn = torch.LongTensor(self.generate_labels_syn(data)).to(device)

    def generate_labels_syn(self, data):
        """Build the synthetic label list with per-class quotas.

        Classes are processed from rarest to most frequent. Every non-final
        class keeps at least one node; the most frequent class absorbs the
        rounding remainder so the total equals ``int(n * reduction_rate)``.

        Side effects: sets ``self.num_class_dict`` (class -> count) and
        ``self.syn_class_indices`` (class -> [start, end) span in the
        returned list).

        Returns:
            list of class labels, grouped by class, of the reduced size.
        """
        from collections import Counter
        counter = Counter(data.labels_train)
        num_class_dict = {}
        n = len(data.labels_train)

        # Rarest classes first so the rounding remainder lands on the
        # largest class at the end.
        sorted_counter = sorted(counter.items(), key=lambda x: x[1])
        sum_ = 0
        labels_syn = []
        self.syn_class_indices = {}
        for ix, (c, num) in enumerate(sorted_counter):
            if ix == len(sorted_counter) - 1:
                # Last (most frequent) class takes whatever budget is left.
                num_class_dict[c] = int(n * self.args.reduction_rate) - sum_
            else:
                # Guarantee at least one node per class at tiny rates.
                num_class_dict[c] = max(int(num * self.args.reduction_rate), 1)
                sum_ += num_class_dict[c]
            self.syn_class_indices[c] = [len(labels_syn), len(labels_syn) + num_class_dict[c]]
            labels_syn += [c] * num_class_dict[c]

        self.num_class_dict = num_class_dict
        return labels_syn

    def select(self):
        """Selection hook; subclasses override with a concrete strategy."""
        return
|
|
|
|
|
class KCenter(Base):
    """K-Center selection: per class, seed with the node nearest the class
    mean, then greedily add the node farthest from the chosen centers."""

    def __init__(self, data, args, device='cuda', **kwargs):
        # BUG FIX: forward the caller's device instead of hard-coding 'cuda',
        # which silently ignored the ``device`` argument.
        super(KCenter, self).__init__(data, args, device=device, **kwargs)

    def select(self, embeds, inductive=False):
        """Return a flat array of selected training indices.

        Args:
            embeds: (num_nodes, dim) tensor of node embeddings, indexable by
                the entries of ``idx_train``.
            inductive: if True, treat training nodes as 0..len(idx_train)-1
                (embeds are assumed to cover only the training set).
        """
        num_class_dict = self.num_class_dict
        if inductive:
            idx_train = np.arange(len(self.data.idx_train))
        else:
            idx_train = self.data.idx_train
        labels_train = self.data.labels_train
        idx_selected = []

        for class_id, cnt in num_class_dict.items():
            idx = idx_train[labels_train == class_id]
            feature = embeds[idx]
            mean = torch.mean(feature, dim=0, keepdim=True)

            # Seed with the point closest to the class mean ...
            dis = torch.cdist(feature, mean)[:, 0]
            rank = torch.argsort(dis)
            idx_centers = rank[:1].tolist()
            # ... then greedily take the point whose nearest-center distance
            # is largest (classic k-center farthest-first traversal).
            for i in range(cnt - 1):
                feature_centers = feature[idx_centers]
                dis_center = torch.cdist(feature, feature_centers)
                dis_min, _ = torch.min(dis_center, dim=-1)
                id_max = torch.argmax(dis_min).item()
                idx_centers.append(id_max)

            idx_selected.append(idx[idx_centers])

        return np.hstack(idx_selected)
|
|
|
|
|
|
|
|
class Herding(Base):
    """Herding selection: per class, iteratively pick the node that keeps the
    running sum of selected embeddings closest to the class mean."""

    def __init__(self, data, args, device='cuda', **kwargs):
        # BUG FIX: forward the caller's device instead of hard-coding 'cuda',
        # which silently ignored the ``device`` argument.
        super(Herding, self).__init__(data, args, device=device, **kwargs)

    def select(self, embeds, inductive=False):
        """Return a flat array of selected training indices.

        Args:
            embeds: (num_nodes, dim) tensor of node embeddings, indexable by
                the entries of ``idx_train``.
            inductive: if True, treat training nodes as 0..len(idx_train)-1
                (embeds are assumed to cover only the training set).
        """
        num_class_dict = self.num_class_dict
        if inductive:
            idx_train = np.arange(len(self.data.idx_train))
        else:
            idx_train = self.data.idx_train
        labels_train = self.data.labels_train
        idx_selected = []

        for class_id, cnt in num_class_dict.items():
            idx = idx_train[labels_train == class_id]
            features = embeds[idx]
            mean = torch.mean(features, dim=0, keepdim=True)
            selected = []
            idx_left = np.arange(features.shape[0]).tolist()

            for i in range(cnt):
                # Residual target: (i+1) * mean minus the sum already chosen;
                # pick the remaining node closest to it.
                det = mean * (i + 1) - torch.sum(features[selected], dim=0)
                dis = torch.cdist(det, features[idx_left])
                # .item() gives a plain int; the original relied on tensor
                # __index__ for the list operations below.
                id_min = torch.argmin(dis).item()
                selected.append(idx_left[id_min])
                del idx_left[id_min]
            idx_selected.append(idx[selected])

        return np.hstack(idx_selected)
|
|
|
|
|
|
|
|
class Random(Base):
    """Uniform random per-class selection (baseline)."""

    def __init__(self, data, args, device='cuda', **kwargs):
        # BUG FIX: forward the caller's device instead of hard-coding 'cuda',
        # which silently ignored the ``device`` argument.
        super(Random, self).__init__(data, args, device=device, **kwargs)

    def select(self, embeds, inductive=False):
        """Randomly pick the per-class quota of training indices.

        ``embeds`` is accepted for interface parity with the other selectors
        but is not used.
        """
        num_class_dict = self.num_class_dict
        if inductive:
            idx_train = np.arange(len(self.data.idx_train))
        else:
            idx_train = self.data.idx_train

        labels_train = self.data.labels_train
        idx_selected = []

        for class_id, cnt in num_class_dict.items():
            idx = idx_train[labels_train == class_id]
            # Shuffle the class indices and keep the first ``cnt``.
            selected = np.random.permutation(idx)
            idx_selected.append(selected[:cnt])

        return np.hstack(idx_selected)
|
|
|
|
|
|
|
|
class LRMC(Base):
    """
    Coreset selection using precomputed seed nodes from the
    Laplacian-Integrated Relaxed Maximal Clique (L-RMC) algorithm.

    Seed nodes are read from a JSON file given by ``args.lrmc_seeds_path``
    and are preferentially selected. Per-class reduction counts are
    respected: if a class has fewer seeds than required, random training
    nodes of that class are added until the quota is met.
    """

    def __init__(self, data, args, device='cuda', **kwargs):
        """
        Raises:
            ValueError: if ``args.lrmc_seeds_path`` is missing or None.
        """
        super(LRMC, self).__init__(data, args, device=device, **kwargs)
        seeds_path = getattr(args, 'lrmc_seeds_path', None)
        if seeds_path is None:
            raise ValueError(
                "LRMC method selected but no path to seed file provided. "
                "Please specify --lrmc_seeds_path when running the training script."
            )
        self.seed_nodes = self._load_seed_nodes(seeds_path)

    def _load_seed_nodes(self, path: str):
        """Load the largest seed cluster from *path*.

        The file is expected to look like
        ``{"clusters": [{"seed_nodes": [...]}, ...]}`` (``"members"`` is
        accepted as a fallback key). Ids >= 1 are assumed 1-indexed and are
        shifted down by one; id 0 is kept as-is; negative or non-integer
        entries are dropped.

        Returns:
            sorted, de-duplicated list of 0-indexed node ids.

        Raises:
            ValueError: if the file contains no clusters.
        """
        with open(path, 'r') as f:
            js = json.load(f)
        clusters = js.get('clusters', [])
        if not clusters:
            raise ValueError(f"No clusters found in L‑RMC seeds file {path}")

        def _cluster_length(c):
            # A cluster may store its nodes under either key.
            nodes = c.get('seed_nodes') or c.get('members') or []
            return len(nodes)

        best_cluster = max(clusters, key=_cluster_length)
        nodes = best_cluster.get('seed_nodes') or best_cluster.get('members') or []
        seed_nodes = []
        for u in nodes:
            try:
                uid = int(u)
            except Exception:
                continue  # skip non-integer entries
            if uid >= 1:
                seed_nodes.append(uid - 1)  # 1-indexed -> 0-indexed
            elif uid == 0:
                seed_nodes.append(0)  # already 0-indexed; keep as-is
            # negative ids are invalid and dropped
        return sorted(set(seed_nodes))

    def select(self, embeds, inductive=False):
        """Select per-class training indices, preferring L-RMC seed nodes.

        ``embeds`` is accepted for interface parity with the other selectors
        but is not used.
        """
        if inductive:
            idx_train = np.arange(len(self.data.idx_train))
        else:
            idx_train = self.data.idx_train
        labels_train = self.data.labels_train
        num_class_dict = self.num_class_dict
        idx_selected = []
        seed_set = set(self.seed_nodes)

        for class_id, cnt in num_class_dict.items():
            class_indices = idx_train[labels_train == class_id]
            # Seeds first, capped at the class quota.
            seed_in_class = [u for u in class_indices if u in seed_set]
            selected = seed_in_class[:cnt]
            remaining_required = cnt - len(selected)
            if remaining_required > 0:
                # Top up with random non-seed nodes from the same class.
                chosen = set(selected)  # O(1) membership vs. list scan
                remaining_candidates = [u for u in class_indices if u not in chosen]
                if len(remaining_candidates) <= remaining_required:
                    additional = remaining_candidates
                else:
                    additional = np.random.choice(
                        remaining_candidates, remaining_required, replace=False
                    ).tolist()
                selected += additional
            # Pin an integer dtype: an empty ``selected`` would otherwise
            # produce a float64 array and upcast the whole hstack result,
            # breaking downstream integer indexing.
            idx_selected.append(np.asarray(selected, dtype=np.int64))
        return np.hstack(idx_selected)
|
|
|
|
|
|
|
|
|