import json
from collections import Counter

import numpy as np
import torch


class Base:
    """Shared scaffolding for coreset-selection strategies.

    On construction, distributes a reduced node budget
    (``len(train) * args.reduction_rate``) across classes and records it in
    ``self.num_class_dict`` / ``self.labels_syn``.  Subclasses implement
    ``select`` to pick concrete training indices.
    """

    def __init__(self, data, args, device='cuda', **kwargs):
        self.data = data
        self.args = args
        self.device = device
        # Total number of nodes to keep after reduction.
        n = int(data.feat_train.shape[0] * args.reduction_rate)
        d = data.feat_train.shape[1]
        self.nnodes_syn = n
        self.labels_syn = torch.LongTensor(self.generate_labels_syn(data)).to(device)

    def generate_labels_syn(self, data):
        """Build the reduced label list, allocating the budget per class.

        Classes are visited from rarest to most frequent.  Every class gets
        at least one slot (``max(..., 1)``); the most frequent class absorbs
        the rounding remainder so the total equals ``int(n * reduction_rate)``.

        Side effects: sets ``self.syn_class_indices`` (class -> ``[start, end)``
        range within the returned list) and ``self.num_class_dict``
        (class -> count).

        Returns:
            list[int]: class labels for the reduced node set, grouped by class.
        """
        counter = Counter(data.labels_train)
        num_class_dict = {}
        n = len(data.labels_train)

        sorted_counter = sorted(counter.items(), key=lambda x: x[1])
        sum_ = 0
        labels_syn = []
        self.syn_class_indices = {}
        for ix, (c, num) in enumerate(sorted_counter):
            if ix == len(sorted_counter) - 1:
                # Largest class takes whatever budget remains (rounding slack).
                num_class_dict[c] = int(n * self.args.reduction_rate) - sum_
            else:
                num_class_dict[c] = max(int(num * self.args.reduction_rate), 1)
                sum_ += num_class_dict[c]
            self.syn_class_indices[c] = [len(labels_syn),
                                         len(labels_syn) + num_class_dict[c]]
            labels_syn += [c] * num_class_dict[c]

        self.num_class_dict = num_class_dict
        return labels_syn

    def select(self):
        # Abstract placeholder; concrete strategies override this.
        return


class KCenter(Base):
    """Greedy k-center coreset selection in embedding space, class by class."""

    def __init__(self, data, args, device='cuda', **kwargs):
        # Fix: the original hardcoded device='cuda' here, silently ignoring
        # the caller-supplied `device` argument.
        super(KCenter, self).__init__(data, args, device=device, **kwargs)

    def select(self, embeds, inductive=False):
        """Pick ``num_class_dict[c]`` nodes per class by greedy k-center.

        Args:
            embeds: (num_train, dim) tensor of node embeddings, indexed by
                training position (inductive) or global index (transductive).
            inductive: if True, rows of ``embeds`` correspond to positions in
                ``data.idx_train`` rather than global node ids.

        Returns:
            np.ndarray of selected training indices (concatenated per class).
        """
        num_class_dict = self.num_class_dict
        if inductive:
            idx_train = np.arange(len(self.data.idx_train))
        else:
            idx_train = self.data.idx_train
        labels_train = self.data.labels_train

        idx_selected = []
        for class_id, cnt in num_class_dict.items():
            idx = idx_train[labels_train == class_id]
            feature = embeds[idx]
            mean = torch.mean(feature, dim=0, keepdim=True)
            # Seed with the node closest to the class centroid.
            dis = torch.cdist(feature, mean)[:, 0]
            rank = torch.argsort(dis)
            idx_centers = rank[:1].tolist()
            # Greedily add the node farthest from its nearest chosen center.
            for _ in range(cnt - 1):
                feature_centers = feature[idx_centers]
                dis_center = torch.cdist(feature, feature_centers)
                dis_min, _ = torch.min(dis_center, dim=-1)
                id_max = torch.argmax(dis_min).item()
                idx_centers.append(id_max)
            idx_selected.append(idx[idx_centers])
        return np.hstack(idx_selected)


class Herding(Base):
    """Herding coreset selection: iteratively match the class mean."""

    def __init__(self, data, args, device='cuda', **kwargs):
        # Fix: the original hardcoded device='cuda' here, silently ignoring
        # the caller-supplied `device` argument.
        super(Herding, self).__init__(data, args, device=device, **kwargs)

    def select(self, embeds, inductive=False):
        """Pick ``num_class_dict[c]`` nodes per class via herding.

        At step i, choose the remaining node whose addition keeps the running
        sum of selected embeddings closest to ``(i + 1) * class_mean``.

        Args:
            embeds: (num_train, dim) tensor of node embeddings.
            inductive: if True, rows of ``embeds`` correspond to positions in
                ``data.idx_train`` rather than global node ids.

        Returns:
            np.ndarray of selected training indices (concatenated per class).
        """
        num_class_dict = self.num_class_dict
        if inductive:
            idx_train = np.arange(len(self.data.idx_train))
        else:
            idx_train = self.data.idx_train
        labels_train = self.data.labels_train

        idx_selected = []
        for class_id, cnt in num_class_dict.items():
            idx = idx_train[labels_train == class_id]
            features = embeds[idx]
            mean = torch.mean(features, dim=0, keepdim=True)
            selected = []
            idx_left = np.arange(features.shape[0]).tolist()

            for i in range(cnt):
                # Residual between the target (scaled mean) and current sum.
                det = mean * (i + 1) - torch.sum(features[selected], dim=0)
                dis = torch.cdist(det, features[idx_left])
                id_min = torch.argmin(dis).item()
                selected.append(idx_left[id_min])
                del idx_left[id_min]
            idx_selected.append(idx[selected])
        return np.hstack(idx_selected)


class Random(Base):
    """Uniform-random coreset selection, stratified by class."""

    def __init__(self, data, args, device='cuda', **kwargs):
        # Fix: the original hardcoded device='cuda' here, silently ignoring
        # the caller-supplied `device` argument.
        super(Random, self).__init__(data, args, device=device, **kwargs)

    def select(self, embeds, inductive=False):
        """Pick ``num_class_dict[c]`` random nodes per class.

        ``embeds`` is accepted for interface parity but unused.

        Returns:
            np.ndarray of selected training indices (concatenated per class).
        """
        num_class_dict = self.num_class_dict
        if inductive:
            idx_train = np.arange(len(self.data.idx_train))
        else:
            idx_train = self.data.idx_train
        labels_train = self.data.labels_train

        idx_selected = []
        for class_id, cnt in num_class_dict.items():
            idx = idx_train[labels_train == class_id]
            selected = np.random.permutation(idx)
            idx_selected.append(selected[:cnt])
        return np.hstack(idx_selected)


class LRMC(Base):
    """
    Coreset selection using precomputed seed nodes from the
    Laplacian‑Integrated Relaxed Maximal Clique (L‑RMC) algorithm.

    Seed nodes are read from a JSON file specified by ``args.lrmc_seeds_path``
    and used to preferentially select training examples.  Per‑class reduction
    counts are respected: if a class has fewer seeds than required, random
    training nodes from that class are added until the quota is met.
    """

    def __init__(self, data, args, device='cuda', **kwargs):
        super(LRMC, self).__init__(data, args, device=device, **kwargs)

        seeds_path = getattr(args, 'lrmc_seeds_path', None)
        if seeds_path is None:
            raise ValueError(
                "LRMC method selected but no path to seed file provided. "
                "Please specify --lrmc_seeds_path when running the training script."
            )
        self.seed_nodes = self._load_seed_nodes(seeds_path)

    def _load_seed_nodes(self, path: str):
        """Parse seed nodes from a JSON file.

        Expects ``{"clusters": [{"seed_nodes": [...]} or {"members": [...]}]}``
        and takes the largest cluster.  Node ids are assumed 1-based and are
        shifted to 0-based (a raw 0 is kept as-is).  TODO confirm the id base
        against the L-RMC exporter.

        Returns:
            sorted list of unique 0-based node indices.

        Raises:
            ValueError: if the file contains no clusters.
        """
        with open(path, 'r') as f:
            js = json.load(f)

        clusters = js.get('clusters', [])
        if not clusters:
            raise ValueError(f"No clusters found in L‑RMC seeds file {path}")

        def _cluster_length(c):
            nodes = c.get('seed_nodes') or c.get('members') or []
            return len(nodes)

        best_cluster = max(clusters, key=_cluster_length)
        nodes = best_cluster.get('seed_nodes') or best_cluster.get('members') or []

        seed_nodes = []
        for u in nodes:
            try:
                uid = int(u)
            except (TypeError, ValueError):
                # Skip entries that are not integer-like.
                continue
            zero_idx = uid - 1
            if zero_idx >= 0:
                seed_nodes.append(zero_idx)
            elif uid >= 0:
                # uid == 0: already 0-based, keep unchanged.
                seed_nodes.append(uid)
        return sorted(set(seed_nodes))

    def select(self, embeds, inductive=False):
        """Pick per-class nodes, preferring precomputed L-RMC seed nodes.

        ``embeds`` is accepted for interface parity but unused.  For each
        class, seed nodes present in that class are taken first; any
        shortfall is filled with uniformly random non-seed class members.

        Returns:
            np.ndarray of selected training indices (concatenated per class).
        """
        if inductive:
            idx_train = np.arange(len(self.data.idx_train))
        else:
            idx_train = self.data.idx_train
        labels_train = self.data.labels_train

        num_class_dict = self.num_class_dict
        idx_selected = []
        seed_set = set(self.seed_nodes)

        for class_id, cnt in num_class_dict.items():
            class_indices = idx_train[labels_train == class_id]
            seed_in_class = [u for u in class_indices if u in seed_set]

            selected = seed_in_class[:min(len(seed_in_class), cnt)]
            remaining_required = cnt - len(selected)
            if remaining_required > 0:
                # Set for O(1) membership (original scanned a list per node).
                chosen = set(selected)
                remaining_candidates = [u for u in class_indices if u not in chosen]
                if len(remaining_candidates) <= remaining_required:
                    additional = remaining_candidates
                else:
                    additional = np.random.choice(
                        remaining_candidates, remaining_required, replace=False
                    ).tolist()
                selected += additional
            idx_selected.append(np.array(selected))
        return np.hstack(idx_selected)