File size: 8,227 Bytes
f74dd01 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 |
import torch
import numpy as np
import json
class Base:
    """Common state for coreset-selection methods.

    Computes the per-class synthetic-label layout implied by
    ``args.reduction_rate`` and stores it for subclasses to use in
    ``select``.
    """

    def __init__(self, data, args, device='cuda', **kwargs):
        # data: object exposing ``feat_train`` (n x d) and ``labels_train``.
        self.data = data
        self.args = args
        self.device = device
        # Total number of synthetic nodes after reduction.
        n = int(data.feat_train.shape[0] * args.reduction_rate)
        d = data.feat_train.shape[1]
        self.nnodes_syn = n
        self.labels_syn = torch.LongTensor(self.generate_labels_syn(data)).to(device)

    def generate_labels_syn(self, data):
        """Build the synthetic label list, one entry per synthetic node.

        Classes are processed from rarest to most frequent. Every class keeps
        at least one node; the most frequent class absorbs the rounding slack
        so the total equals ``int(len(labels_train) * reduction_rate)``.

        Side effects: sets ``self.num_class_dict`` (class -> node count) and
        ``self.syn_class_indices`` (class -> [start, end) span in the list).

        Returns:
            list: synthetic labels, grouped contiguously by class.
        """
        from collections import Counter
        counter = Counter(data.labels_train)
        num_class_dict = {}
        n = len(data.labels_train)
        # Rarest classes first, so their max(..., 1) floor is budgeted early.
        sorted_counter = sorted(counter.items(), key=lambda x: x[1])
        sum_ = 0
        labels_syn = []
        self.syn_class_indices = {}
        for ix, (c, num) in enumerate(sorted_counter):
            if ix == len(sorted_counter) - 1:
                # Last (largest) class takes whatever budget remains.
                num_class_dict[c] = int(n * self.args.reduction_rate) - sum_
            else:
                num_class_dict[c] = max(int(num * self.args.reduction_rate), 1)
                sum_ += num_class_dict[c]
            self.syn_class_indices[c] = [len(labels_syn), len(labels_syn) + num_class_dict[c]]
            labels_syn += [c] * num_class_dict[c]
        self.num_class_dict = num_class_dict
        return labels_syn

    def select(self):
        # Subclasses override this with their selection strategy.
        return
class KCenter(Base):
    """K-Center coreset: per class, greedily picks points that maximize the
    minimum distance to the already-chosen centers."""

    def __init__(self, data, args, device='cuda', **kwargs):
        # Bug fix: forward the caller's device instead of hardcoding 'cuda'
        # (matches how LRMC invokes the base constructor).
        super(KCenter, self).__init__(data, args, device=device, **kwargs)

    def select(self, embeds, inductive=False):
        """Return per-class training indices chosen by the k-center greedy rule.

        Args:
            embeds: embedding matrix indexable by training-node index.
            inductive: if True, training rows are assumed to be the first
                ``len(idx_train)`` rows of ``embeds``.

        Returns:
            np.ndarray: 1-D array of selected training indices.
        """
        num_class_dict = self.num_class_dict
        if inductive:
            idx_train = np.arange(len(self.data.idx_train))
        else:
            idx_train = self.data.idx_train
        labels_train = self.data.labels_train
        idx_selected = []
        # kcenter, class by class
        for class_id, cnt in num_class_dict.items():
            idx = idx_train[labels_train == class_id]
            feature = embeds[idx]
            mean = torch.mean(feature, dim=0, keepdim=True)
            # Seed with the point closest to the class mean.
            dis = torch.cdist(feature, mean)[:, 0]
            rank = torch.argsort(dis)
            idx_centers = rank[:1].tolist()
            for i in range(cnt - 1):
                feature_centers = feature[idx_centers]
                dis_center = torch.cdist(feature, feature_centers)
                dis_min, _ = torch.min(dis_center, dim=-1)
                # Farthest-point heuristic: add the point farthest from its
                # nearest already-selected center.
                id_max = torch.argmax(dis_min).item()
                idx_centers.append(id_max)
            idx_selected.append(idx[idx_centers])
        return np.hstack(idx_selected)
class Herding(Base):
    """Herding coreset: per class, iteratively picks the point that keeps the
    running sum of selected features closest to the scaled class mean."""

    def __init__(self, data, args, device='cuda', **kwargs):
        # Bug fix: forward the caller's device instead of hardcoding 'cuda'
        # (matches how LRMC invokes the base constructor).
        super(Herding, self).__init__(data, args, device=device, **kwargs)

    def select(self, embeds, inductive=False):
        """Return per-class training indices chosen by herding.

        Args:
            embeds: embedding matrix indexable by training-node index.
            inductive: if True, training rows are assumed to be the first
                ``len(idx_train)`` rows of ``embeds``.

        Returns:
            np.ndarray: 1-D array of selected training indices.
        """
        num_class_dict = self.num_class_dict
        if inductive:
            idx_train = np.arange(len(self.data.idx_train))
        else:
            idx_train = self.data.idx_train
        labels_train = self.data.labels_train
        idx_selected = []
        # herding, class by class
        for class_id, cnt in num_class_dict.items():
            idx = idx_train[labels_train == class_id]
            features = embeds[idx]
            mean = torch.mean(features, dim=0, keepdim=True)
            selected = []
            idx_left = np.arange(features.shape[0]).tolist()
            for i in range(cnt):
                # Residual target: (i+1)*mean minus the sum selected so far;
                # pick the remaining point closest to it.
                det = mean * (i + 1) - torch.sum(features[selected], dim=0)
                dis = torch.cdist(det, features[idx_left])
                id_min = torch.argmin(dis)
                selected.append(idx_left[id_min])
                del idx_left[id_min]
            idx_selected.append(idx[selected])
        return np.hstack(idx_selected)
class Random(Base):
    """Random coreset: uniformly samples the per-class quota of training nodes."""

    def __init__(self, data, args, device='cuda', **kwargs):
        # Bug fix: forward the caller's device instead of hardcoding 'cuda'
        # (matches how LRMC invokes the base constructor).
        super(Random, self).__init__(data, args, device=device, **kwargs)

    def select(self, embeds, inductive=False):
        """Return randomly chosen training indices respecting class quotas.

        ``embeds`` is unused; it is accepted for interface parity with the
        other selection methods.

        Returns:
            np.ndarray: 1-D array of selected training indices.
        """
        num_class_dict = self.num_class_dict
        if inductive:
            idx_train = np.arange(len(self.data.idx_train))
        else:
            idx_train = self.data.idx_train
        labels_train = self.data.labels_train
        idx_selected = []
        for class_id, cnt in num_class_dict.items():
            idx = idx_train[labels_train == class_id]
            # Shuffle the class's indices and take the first cnt.
            selected = np.random.permutation(idx)
            idx_selected.append(selected[:cnt])
        return np.hstack(idx_selected)
class LRMC(Base):
    """
    Coreset selection using precomputed seed nodes from the Laplacian-Integrated
    Relaxed Maximal Clique (L-RMC) algorithm. Seed nodes are read from a JSON
    file specified by ``args.lrmc_seeds_path`` and used to preferentially select
    training examples. Per-class reduction counts are respected: if a class has
    fewer seeds than required, random training nodes from that class are added
    until the quota is met.
    """

    def __init__(self, data, args, device='cuda', **kwargs):
        super(LRMC, self).__init__(data, args, device=device, **kwargs)
        seeds_path = getattr(args, 'lrmc_seeds_path', None)
        if seeds_path is None:
            raise ValueError(
                "LRMC method selected but no path to seed file provided. "
                "Please specify --lrmc_seeds_path when running the training script."
            )
        self.seed_nodes = self._load_seed_nodes(seeds_path)

    def _load_seed_nodes(self, path: str):
        """Load seed node IDs from the largest cluster in the JSON file.

        Each cluster may store its nodes under ``'seed_nodes'`` or
        ``'members'``. IDs are assumed 1-based and shifted to 0-based;
        an ID of 0 is kept as-is (treated as already 0-based).
        Non-integer entries are skipped.

        Returns:
            list[int]: sorted, de-duplicated 0-based node indices.

        Raises:
            ValueError: if the file contains no clusters.
        """
        with open(path, 'r') as f:
            js = json.load(f)
        clusters = js.get('clusters', [])
        if not clusters:
            raise ValueError(f"No clusters found in L‑RMC seeds file {path}")

        def _cluster_length(c):
            nodes = c.get('seed_nodes') or c.get('members') or []
            return len(nodes)

        best_cluster = max(clusters, key=_cluster_length)
        nodes = best_cluster.get('seed_nodes') or best_cluster.get('members') or []
        seed_nodes = []
        for u in nodes:
            try:
                uid = int(u)
            except Exception:
                # Skip entries that cannot be interpreted as integers.
                continue
            zero_idx = uid - 1
            if zero_idx >= 0:
                seed_nodes.append(zero_idx)
            elif uid >= 0:
                # uid == 0: already 0-based, keep unchanged.
                seed_nodes.append(uid)
        return sorted(set(seed_nodes))

    def select(self, embeds, inductive=False):
        """Pick per-class training indices, preferring L-RMC seed nodes and
        topping up with random class members when seeds are scarce.

        ``embeds`` is unused; it is accepted for interface parity with the
        other selection methods.

        NOTE(review): under ``inductive=True`` the indices are positional
        (0..len(idx_train)-1) while seed IDs come from the JSON file —
        confirm both use the same numbering in that setting.

        Returns:
            np.ndarray: 1-D array of selected training indices.
        """
        if inductive:
            idx_train = np.arange(len(self.data.idx_train))
        else:
            idx_train = self.data.idx_train
        labels_train = self.data.labels_train
        num_class_dict = self.num_class_dict
        idx_selected = []
        seed_set = set(self.seed_nodes)
        for class_id, cnt in num_class_dict.items():
            class_indices = idx_train[labels_train == class_id]
            seed_in_class = [u for u in class_indices if u in seed_set]
            # Slicing clamps automatically when fewer seeds than cnt exist.
            selected = seed_in_class[:cnt]
            remaining_required = cnt - len(selected)
            if remaining_required > 0:
                # Set membership keeps this pass O(n) instead of O(n^2).
                chosen = set(selected)
                remaining_candidates = [u for u in class_indices if u not in chosen]
                if len(remaining_candidates) <= remaining_required:
                    additional = remaining_candidates
                else:
                    additional = np.random.choice(
                        remaining_candidates, remaining_required, replace=False
                    ).tolist()
                selected += additional
            idx_selected.append(np.array(selected))
        return np.hstack(idx_selected)
|