|
|
import torch |
|
|
import numpy as np |
|
|
import json |
|
|
|
|
|
|
|
|
class Base:
    """Base class for per-class coreset/condensation selection.

    Derives the size of the reduced set from ``args.reduction_rate`` and
    builds per-class quotas over the training labels; subclasses implement
    the actual ``select`` strategy.
    """

    def __init__(self, data, args, device='cuda', **kwargs):
        """
        Args:
            data: dataset object exposing ``feat_train`` (num_train x dim)
                and ``labels_train`` (iterable of integer class labels).
            args: namespace with at least ``reduction_rate`` in (0, 1].
            device: torch device for the synthetic label tensor.
        """
        self.data = data
        self.args = args
        self.device = device
        # Target number of nodes in the reduced set.
        n = int(data.feat_train.shape[0] * args.reduction_rate)
        self.nnodes_syn = n
        self.labels_syn = torch.LongTensor(self.generate_labels_syn(data)).to(device)

    def generate_labels_syn(self, data):
        """Build the synthetic label list with per-class quotas.

        Classes are processed from rarest to most frequent. Every non-final
        class keeps at least one node; the most frequent class absorbs the
        rounding remainder so the total equals ``int(n * reduction_rate)``.

        Side effects: sets ``self.num_class_dict`` (class -> count) and
        ``self.syn_class_indices`` (class -> [start, end) span in the
        returned list).

        Returns:
            list of class labels, grouped by class, of the reduced size.
        """
        from collections import Counter
        counter = Counter(data.labels_train)
        num_class_dict = {}
        n = len(data.labels_train)

        # Rarest classes first so the rounding remainder lands on the
        # largest class at the end.
        sorted_counter = sorted(counter.items(), key=lambda x: x[1])
        sum_ = 0
        labels_syn = []
        self.syn_class_indices = {}
        for ix, (c, num) in enumerate(sorted_counter):
            if ix == len(sorted_counter) - 1:
                # Last (most frequent) class takes whatever budget is left.
                num_class_dict[c] = int(n * self.args.reduction_rate) - sum_
            else:
                # Guarantee at least one node per class at tiny rates.
                num_class_dict[c] = max(int(num * self.args.reduction_rate), 1)
                sum_ += num_class_dict[c]
            self.syn_class_indices[c] = [len(labels_syn), len(labels_syn) + num_class_dict[c]]
            labels_syn += [c] * num_class_dict[c]

        self.num_class_dict = num_class_dict
        return labels_syn

    def select(self):
        """Selection hook; subclasses override with a concrete strategy."""
        return
|
|
|
|
|
class KCenter(Base):
    """K-Center selection: per class, seed with the node nearest the class
    mean, then greedily add the node farthest from the chosen centers."""

    def __init__(self, data, args, device='cuda', **kwargs):
        # BUG FIX: forward the caller's device instead of hard-coding 'cuda',
        # which silently ignored the ``device`` argument.
        super(KCenter, self).__init__(data, args, device=device, **kwargs)

    def select(self, embeds, inductive=False):
        """Return a flat array of selected training indices.

        Args:
            embeds: (num_nodes, dim) tensor of node embeddings, indexable by
                the entries of ``idx_train``.
            inductive: if True, treat training nodes as 0..len(idx_train)-1
                (embeds are assumed to cover only the training set).
        """
        num_class_dict = self.num_class_dict
        if inductive:
            idx_train = np.arange(len(self.data.idx_train))
        else:
            idx_train = self.data.idx_train
        labels_train = self.data.labels_train
        idx_selected = []

        for class_id, cnt in num_class_dict.items():
            idx = idx_train[labels_train == class_id]
            feature = embeds[idx]
            mean = torch.mean(feature, dim=0, keepdim=True)

            # Seed with the point closest to the class mean ...
            dis = torch.cdist(feature, mean)[:, 0]
            rank = torch.argsort(dis)
            idx_centers = rank[:1].tolist()
            # ... then greedily take the point whose nearest-center distance
            # is largest (classic k-center farthest-first traversal).
            for i in range(cnt - 1):
                feature_centers = feature[idx_centers]
                dis_center = torch.cdist(feature, feature_centers)
                dis_min, _ = torch.min(dis_center, dim=-1)
                id_max = torch.argmax(dis_min).item()
                idx_centers.append(id_max)

            idx_selected.append(idx[idx_centers])

        return np.hstack(idx_selected)
|
|
|
|
|
|
|
|
class Herding(Base):
    """Herding selection: per class, iteratively pick the node that keeps the
    running sum of selected embeddings closest to the class mean."""

    def __init__(self, data, args, device='cuda', **kwargs):
        # BUG FIX: forward the caller's device instead of hard-coding 'cuda',
        # which silently ignored the ``device`` argument.
        super(Herding, self).__init__(data, args, device=device, **kwargs)

    def select(self, embeds, inductive=False):
        """Return a flat array of selected training indices.

        Args:
            embeds: (num_nodes, dim) tensor of node embeddings, indexable by
                the entries of ``idx_train``.
            inductive: if True, treat training nodes as 0..len(idx_train)-1
                (embeds are assumed to cover only the training set).
        """
        num_class_dict = self.num_class_dict
        if inductive:
            idx_train = np.arange(len(self.data.idx_train))
        else:
            idx_train = self.data.idx_train
        labels_train = self.data.labels_train
        idx_selected = []

        for class_id, cnt in num_class_dict.items():
            idx = idx_train[labels_train == class_id]
            features = embeds[idx]
            mean = torch.mean(features, dim=0, keepdim=True)
            selected = []
            idx_left = np.arange(features.shape[0]).tolist()

            for i in range(cnt):
                # Residual target: (i+1) * mean minus the sum already chosen;
                # pick the remaining node closest to it.
                det = mean * (i + 1) - torch.sum(features[selected], dim=0)
                dis = torch.cdist(det, features[idx_left])
                # .item() gives a plain int; the original relied on tensor
                # __index__ for the list operations below.
                id_min = torch.argmin(dis).item()
                selected.append(idx_left[id_min])
                del idx_left[id_min]
            idx_selected.append(idx[selected])

        return np.hstack(idx_selected)
|
|
|
|
|
|
|
|
class Random(Base):
    """Uniform random per-class selection (baseline)."""

    def __init__(self, data, args, device='cuda', **kwargs):
        # BUG FIX: forward the caller's device instead of hard-coding 'cuda',
        # which silently ignored the ``device`` argument.
        super(Random, self).__init__(data, args, device=device, **kwargs)

    def select(self, embeds, inductive=False):
        """Randomly pick the per-class quota of training indices.

        ``embeds`` is accepted for interface parity with the other selectors
        but is not used.
        """
        num_class_dict = self.num_class_dict
        if inductive:
            idx_train = np.arange(len(self.data.idx_train))
        else:
            idx_train = self.data.idx_train

        labels_train = self.data.labels_train
        idx_selected = []

        for class_id, cnt in num_class_dict.items():
            idx = idx_train[labels_train == class_id]
            # Shuffle the class indices and keep the first ``cnt``.
            selected = np.random.permutation(idx)
            idx_selected.append(selected[:cnt])

        return np.hstack(idx_selected)
|
|
|
|
|
|
|
|
class LRMC(Base):
    """
    Coreset selection using precomputed seed nodes from the
    Laplacian-Integrated Relaxed Maximal Clique (L-RMC) algorithm.

    Seed nodes are read from a JSON file given by ``args.lrmc_seeds_path``
    and are preferentially selected. Per-class reduction counts are
    respected: if a class has fewer seeds than required, random training
    nodes of that class are added until the quota is met.
    """

    def __init__(self, data, args, device='cuda', **kwargs):
        """
        Raises:
            ValueError: if ``args.lrmc_seeds_path`` is missing or None.
        """
        super(LRMC, self).__init__(data, args, device=device, **kwargs)
        seeds_path = getattr(args, 'lrmc_seeds_path', None)
        if seeds_path is None:
            raise ValueError(
                "LRMC method selected but no path to seed file provided. "
                "Please specify --lrmc_seeds_path when running the training script."
            )
        self.seed_nodes = self._load_seed_nodes(seeds_path)

    def _load_seed_nodes(self, path: str):
        """Load the largest seed cluster from *path*.

        The file is expected to look like
        ``{"clusters": [{"seed_nodes": [...]}, ...]}`` (``"members"`` is
        accepted as a fallback key). Ids >= 1 are assumed 1-indexed and are
        shifted down by one; id 0 is kept as-is; negative or non-integer
        entries are dropped.

        Returns:
            sorted, de-duplicated list of 0-indexed node ids.

        Raises:
            ValueError: if the file contains no clusters.
        """
        with open(path, 'r') as f:
            js = json.load(f)
        clusters = js.get('clusters', [])
        if not clusters:
            raise ValueError(f"No clusters found in L‑RMC seeds file {path}")

        def _cluster_length(c):
            # A cluster may store its nodes under either key.
            nodes = c.get('seed_nodes') or c.get('members') or []
            return len(nodes)

        best_cluster = max(clusters, key=_cluster_length)
        nodes = best_cluster.get('seed_nodes') or best_cluster.get('members') or []
        seed_nodes = []
        for u in nodes:
            try:
                uid = int(u)
            except Exception:
                continue  # skip non-integer entries
            if uid >= 1:
                seed_nodes.append(uid - 1)  # 1-indexed -> 0-indexed
            elif uid == 0:
                seed_nodes.append(0)  # already 0-indexed; keep as-is
            # negative ids are invalid and dropped
        return sorted(set(seed_nodes))

    def select(self, embeds, inductive=False):
        """Select per-class training indices, preferring L-RMC seed nodes.

        ``embeds`` is accepted for interface parity with the other selectors
        but is not used.
        """
        if inductive:
            idx_train = np.arange(len(self.data.idx_train))
        else:
            idx_train = self.data.idx_train
        labels_train = self.data.labels_train
        num_class_dict = self.num_class_dict
        idx_selected = []
        seed_set = set(self.seed_nodes)

        for class_id, cnt in num_class_dict.items():
            class_indices = idx_train[labels_train == class_id]
            # Seeds first, capped at the class quota.
            seed_in_class = [u for u in class_indices if u in seed_set]
            selected = seed_in_class[:cnt]
            remaining_required = cnt - len(selected)
            if remaining_required > 0:
                # Top up with random non-seed nodes from the same class.
                chosen = set(selected)  # O(1) membership vs. list scan
                remaining_candidates = [u for u in class_indices if u not in chosen]
                if len(remaining_candidates) <= remaining_required:
                    additional = remaining_candidates
                else:
                    additional = np.random.choice(
                        remaining_candidates, remaining_required, replace=False
                    ).tolist()
                selected += additional
            # Pin an integer dtype: an empty ``selected`` would otherwise
            # produce a float64 array and upcast the whole hstack result,
            # breaking downstream integer indexing.
            idx_selected.append(np.asarray(selected, dtype=np.int64))
        return np.hstack(idx_selected)
|
|
|
|
|
|
|
|
|