clique

File size: 8,227 Bytes

f74dd01

import torch
import numpy as np
import json


class Base:
    """Shared setup for the coreset selectors: per-class budgets and labels.

    On construction the training labels are counted and every class is given
    a share of the reduced node budget.  The results are stored as:

    * ``nnodes_syn`` - ``int(data.feat_train.shape[0] * args.reduction_rate)``.
    * ``num_class_dict`` - class label -> number of nodes to keep.
    * ``syn_class_indices`` - class label -> ``[start, end)`` slice of that
      class inside ``labels_syn``.
    * ``labels_syn`` - ``LongTensor`` of the reduced labels, moved to
      ``device``.

    Args:
        data: object exposing ``feat_train`` (anything with ``.shape``) and
            ``labels_train`` (an iterable of hashable class labels).
        args: object exposing ``reduction_rate``.
        device: device the synthetic label tensor is moved to.
        **kwargs: accepted for interface compatibility; unused here.
    """

    def __init__(self, data, args, device='cuda', **kwargs):
        self.data = data
        self.args = args
        self.device = device
        self.nnodes_syn = int(data.feat_train.shape[0] * args.reduction_rate)
        self.labels_syn = torch.LongTensor(self.generate_labels_syn(data)).to(device)

    def generate_labels_syn(self, data):
        """Split the reduced budget across classes and build the label list.

        Classes are visited from least to most frequent.  Every class except
        the last gets ``max(int(count * reduction_rate), 1)`` slots; the last
        (most frequent) class gets whatever is left of
        ``int(len(labels_train) * reduction_rate)``.

        Side effects: sets ``self.num_class_dict`` and
        ``self.syn_class_indices``.

        Returns:
            list: the class label of every reduced node, grouped by class in
            the visiting order described above.
        """
        from collections import Counter

        counter = Counter(data.labels_train)
        budget = int(len(data.labels_train) * self.args.reduction_rate)
        sorted_counter = sorted(counter.items(), key=lambda x: x[1])
        last_ix = len(sorted_counter) - 1

        num_class_dict = {}
        labels_syn = []
        self.syn_class_indices = {}
        allocated = 0
        for ix, (c, num) in enumerate(sorted_counter):
            if ix == last_ix:
                # The one-slot floor applied to the smaller classes can use
                # up more than the whole budget; clamp so the remainder never
                # becomes a negative count (which would yield an inverted
                # index range and break slicing with ``[:cnt]`` downstream).
                quota = max(budget - allocated, 0)
            else:
                quota = max(int(num * self.args.reduction_rate), 1)
                allocated += quota
            num_class_dict[c] = quota
            start = len(labels_syn)
            self.syn_class_indices[c] = [start, start + quota]
            labels_syn += [c] * quota

        self.num_class_dict = num_class_dict
        return labels_syn

    def select(self, embeds=None, inductive=False):
        """Placeholder selection hook; subclasses override it.

        The optional parameters mirror the subclass signatures so every
        selector can be called the same way.  Returns ``None``.
        """
        return

class KCenter(Base):
    """Per-class greedy k-center selection over node embeddings.

    For each class the first center is the node closest to the class mean;
    each further center is the node whose distance to its nearest already
    chosen center is largest.
    """

    def __init__(self, data, args, device='cuda', **kwargs):
        # Forward the caller's device; it used to be overridden with 'cuda',
        # which ignored the argument and failed on CPU-only machines.
        super(KCenter, self).__init__(data, args, device=device, **kwargs)

    def select(self, embeds, inductive=False):
        """Return the selected training-node indices, concatenated by class.

        Args:
            embeds: embeddings indexed by node id and passed to
                ``torch.mean`` / ``torch.cdist`` - presumably a 2-D tensor
                with one row per node (confirm against the caller).
            inductive: when true, nodes are addressed by their position in
                ``data.idx_train`` (``0..len-1``) instead of by the ids
                stored there.

        Returns:
            numpy.ndarray: 1-D array of the chosen indices, classes in the
            iteration order of ``self.num_class_dict``.
        """
        num_class_dict = self.num_class_dict
        if inductive:
            idx_train = np.arange(len(self.data.idx_train))
        else:
            idx_train = self.data.idx_train
        labels_train = self.data.labels_train
        idx_selected = []

        for class_id, cnt in num_class_dict.items():
            idx = idx_train[labels_train == class_id]
            feature = embeds[idx]
            mean = torch.mean(feature, dim=0, keepdim=True)
            dis = torch.cdist(feature, mean)[:, 0]
            rank = torch.argsort(dis)
            # Seed the center set with the node nearest to the class mean.
            idx_centers = rank[:1].tolist()
            for _ in range(cnt - 1):
                feature_centers = feature[idx_centers]
                dis_center = torch.cdist(feature, feature_centers)
                # Distance of every node to its closest chosen center.
                dis_min, _unused = torch.min(dis_center, dim=-1)
                id_max = torch.argmax(dis_min).item()
                idx_centers.append(id_max)

            idx_selected.append(idx[idx_centers])
        return np.hstack(idx_selected)


class Herding(Base):
    """Per-class herding selection over node embeddings.

    Nodes are picked one at a time so that the running sum of the chosen
    embeddings stays close to the matching multiple of the class mean.
    """

    def __init__(self, data, args, device='cuda', **kwargs):
        # Forward the caller's device; it used to be overridden with 'cuda',
        # which ignored the argument and failed on CPU-only machines.
        super(Herding, self).__init__(data, args, device=device, **kwargs)

    def select(self, embeds, inductive=False):
        """Return the selected training-node indices, concatenated by class.

        Args:
            embeds: embeddings indexed by node id and passed to
                ``torch.mean`` / ``torch.cdist`` - presumably a 2-D tensor
                with one row per node (confirm against the caller).
            inductive: when true, nodes are addressed by their position in
                ``data.idx_train`` (``0..len-1``) instead of by the ids
                stored there.

        Returns:
            numpy.ndarray: 1-D array of the chosen indices, classes in the
            iteration order of ``self.num_class_dict``.
        """
        num_class_dict = self.num_class_dict
        if inductive:
            idx_train = np.arange(len(self.data.idx_train))
        else:
            idx_train = self.data.idx_train
        labels_train = self.data.labels_train
        idx_selected = []

        # Herding is run independently for every class.
        for class_id, cnt in num_class_dict.items():
            idx = idx_train[labels_train == class_id]
            features = embeds[idx]
            mean = torch.mean(features, dim=0, keepdim=True)
            selected = []
            idx_left = np.arange(features.shape[0]).tolist()

            for i in range(cnt):
                # Target for the next pick: (i + 1) * mean minus what the
                # already chosen nodes contribute.
                det = mean * (i + 1) - torch.sum(features[selected], dim=0)
                dis = torch.cdist(det, features[idx_left])
                # Plain int position inside ``idx_left``.
                id_min = torch.argmin(dis).item()
                selected.append(idx_left[id_min])
                del idx_left[id_min]
            idx_selected.append(idx[selected])
        return np.hstack(idx_selected)


class Random(Base):
    """Per-class uniform random selection of training nodes."""

    def __init__(self, data, args, device='cuda', **kwargs):
        # Forward the caller's device; it used to be overridden with 'cuda',
        # which ignored the argument and failed on CPU-only machines.
        super(Random, self).__init__(data, args, device=device, **kwargs)

    def select(self, embeds, inductive=False):
        """Return randomly chosen training-node indices, concatenated by class.

        Args:
            embeds: unused; accepted so all selectors share one signature.
            inductive: when true, nodes are addressed by their position in
                ``data.idx_train`` (``0..len-1``) instead of by the ids
                stored there.

        Returns:
            numpy.ndarray: 1-D array holding, for every class, the first
            ``cnt`` entries of a random permutation of that class's indices.
        """
        num_class_dict = self.num_class_dict
        if inductive:
            idx_train = np.arange(len(self.data.idx_train))
        else:
            idx_train = self.data.idx_train

        labels_train = self.data.labels_train
        idx_selected = []

        for class_id, cnt in num_class_dict.items():
            idx = idx_train[labels_train == class_id]
            selected = np.random.permutation(idx)
            idx_selected.append(selected[:cnt])

        return np.hstack(idx_selected)


class LRMC(Base):
    """
    Coreset selection using precomputed seed nodes from the Laplacian‑Integrated
    Relaxed Maximal Clique (L‑RMC) algorithm.  Seed nodes are read from a JSON
    file specified by ``args.lrmc_seeds_path`` and used to preferentially select
    training examples.  Per‑class reduction counts are respected: if a class has
    fewer seeds than required, random training nodes from that class are added
    until the quota is met.
    """

    def __init__(self, data, args, device='cuda', **kwargs):
        """Set up the per-class budgets and load the seed nodes.

        Raises:
            ValueError: if ``args`` has no ``lrmc_seeds_path`` (or it is
                ``None``), or the seeds file contains no clusters.
        """
        super(LRMC, self).__init__(data, args, device=device, **kwargs)
        seeds_path = getattr(args, 'lrmc_seeds_path', None)
        if seeds_path is None:
            raise ValueError(
                "LRMC method selected but no path to seed file provided. "
                "Please specify --lrmc_seeds_path when running the training script."
            )
        self.seed_nodes = self._load_seed_nodes(seeds_path)

    def _load_seed_nodes(self, path: str):
        """Read the seed node ids of the largest cluster in the JSON file.

        The file must hold an object with a ``clusters`` list; each cluster
        lists its nodes under ``seed_nodes`` or, failing that, ``members``.
        Only the cluster with the most nodes is used.  Ids of 1 and above are
        shifted down by one, an id of 0 is kept as 0, and negative or
        non-integer entries are dropped.

        Returns:
            list[int]: sorted, de-duplicated node indices.

        Raises:
            ValueError: if the file has no clusters.
        """
        with open(path, 'r') as f:
            js = json.load(f)
        clusters = js.get('clusters', [])
        if not clusters:
            raise ValueError(f"No clusters found in L‑RMC seeds file {path}")

        def _cluster_nodes(c):
            return c.get('seed_nodes') or c.get('members') or []

        best_cluster = max(clusters, key=lambda c: len(_cluster_nodes(c)))
        seed_nodes = set()
        for u in _cluster_nodes(best_cluster):
            try:
                uid = int(u)
            except (TypeError, ValueError, OverflowError):
                # Skip entries that cannot be read as an integer id.
                continue
            if uid >= 1:
                seed_nodes.add(uid - 1)
            elif uid == 0:
                seed_nodes.add(0)
        return sorted(seed_nodes)

    def select(self, embeds, inductive=False):
        """Return the selected training-node indices, concatenated by class.

        For every class, seed nodes that belong to the class are taken first
        (in training order, up to the class quota); any shortfall is filled
        with nodes drawn at random, without replacement, from the rest of the
        class.

        Args:
            embeds: unused; accepted so all selectors share one signature.
            inductive: when true, nodes are addressed by their position in
                ``data.idx_train`` (``0..len-1``) instead of by the ids
                stored there.
                NOTE(review): seed ids are matched against these positions
                as-is - confirm the seeds file uses the same index space.

        Returns:
            numpy.ndarray: 1-D integer array of the chosen indices, classes in
            the iteration order of ``self.num_class_dict``.
        """
        if inductive:
            idx_train = np.arange(len(self.data.idx_train))
        else:
            idx_train = self.data.idx_train
        labels_train = self.data.labels_train
        seed_set = set(self.seed_nodes)
        idx_selected = []

        for class_id, cnt in self.num_class_dict.items():
            # A negative quota would turn the slice below into "all but the
            # last few"; treat it as "select nothing".
            cnt = max(cnt, 0)
            class_indices = idx_train[labels_train == class_id]
            selected = [u for u in class_indices if u in seed_set][:cnt]
            remaining_required = cnt - len(selected)
            if remaining_required > 0:
                # Set lookup keeps the filter linear in the class size.
                already_chosen = set(selected)
                remaining_candidates = [
                    u for u in class_indices if u not in already_chosen
                ]
                if len(remaining_candidates) <= remaining_required:
                    additional = remaining_candidates
                else:
                    additional = np.random.choice(
                        remaining_candidates, remaining_required, replace=False
                    ).tolist()
                selected += additional
            # Explicit integer dtype: an empty list would otherwise become a
            # float64 array and make the stacked result unusable as indices.
            idx_selected.append(np.asarray(selected, dtype=np.int64))
        return np.hstack(idx_selected)