File size: 3,308 Bytes
7968cb0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
import random


class cached_property(object):
    """
    Descriptor (non-data) for building an attribute on-demand on first use.

    The wrapped factory runs the first time the attribute is read through an
    instance.  Its result is then stored on that instance under the factory's
    own name, so later reads find the plain instance attribute and never
    reach this descriptor again.
    """
    def __init__(self, factory):
        """
        <factory> is called such: factory(instance) to build the attribute.
        """
        self._attr_name = factory.__name__
        self._factory = factory

    def __get__(self, instance, owner):
        # Looked up on the class itself (e.g. ``Owner.attr``): there is no
        # instance to build for or to cache on, so hand back the descriptor
        # instead of calling the factory with None.
        if instance is None:
            return self

        # Build the attribute.
        attr = self._factory(instance)

        # Cache the value; hide ourselves.  Because this class defines no
        # __set__, the instance attribute written here takes precedence over
        # the descriptor on subsequent lookups.
        setattr(instance, self._attr_name, attr)
        return attr


def get_inds(expected_num, clu_nums, cid2clu, seq2ind):
    """
    Randomly pick whole clusters until at least <expected_num> sequence
    indices have been gathered, or until every cluster has been picked.

    Clusters are visited round-robin in <clu_nums> order.  On each visit a
    cluster that has not been picked yet is accepted when a fresh
    random.random() draw exceeds 0.5; otherwise it is revisited on a later
    pass.

    NOTE: this module defines ``get_inds`` a second time further down; at
    import time that later definition replaces this one.

    <expected_num> target number of indices to collect.
    <clu_nums>     sequence of (cluster id, size) pairs; the size is not
                   used by this function.
    <cid2clu>      mapping from cluster id to an iterable of sequences.
    <seq2ind>      mapping from sequence to index; sequences missing from it
                   are skipped and do not count towards the target.

    Returns (query_cids, query_idx): the picked cluster ids in pick order and
    the collected indices.  query_idx may be shorter than <expected_num> when
    the clusters run out first.
    """
    query_cids, query_idx = [], []
    picked = set()  # O(1) "already selected?" test
    distinct_cids = len({cid for cid, _ in clu_nums})

    cur_len, cur_idx = 0, 0
    # The second condition stops the round-robin once nothing is left to
    # pick; without it the loop would spin forever on an unreachable target
    # (and an empty <clu_nums> would divide by zero below).
    while cur_len < expected_num and len(picked) < distinct_cids:
        cid, _ = clu_nums[cur_idx % len(clu_nums)]
        cur_idx += 1
        # check if this cluster has been selected
        if cid in picked:
            continue
        if random.random() > 0.5:
            for seq in cid2clu[cid]:
                # seq2ind: ensure it is in limited lengths
                if seq in seq2ind:
                    query_idx.append(seq2ind[seq])
                    cur_len += 1

            picked.add(cid)
            query_cids.append(cid)
    return query_cids, query_idx


def get_num(N, valid_num=100):
    """
    Split <N> items into train / validation / test counts.

    Train gets int(0.9 * N).  Validation gets int(0.05 * N), capped at
    <valid_num>.  Test receives whatever is left, so the three counts always
    sum to <N>.

    Returns (train_n, valid_n, test_n).
    """
    train_n = int(0.9 * N)
    five_percent = int(0.05 * N)
    valid_n = min(valid_num, five_percent)
    test_n = N - train_n - valid_n
    return train_n, valid_n, test_n


def get_full_inds(expected_num, clu_nums, cid2clu, full_seq2ind):
    """
    Randomly pick whole clusters, without replacement, until at least
    <expected_num> indices have been gathered across all datasets, or until
    the clusters run out.

    <expected_num>  target number of indices, summed over every dataset (a
                    sequence present in two datasets counts twice).
    <clu_nums>      sequence of (cluster id, size) pairs; the size is not
                    used by this function.
    <cid2clu>       mapping from cluster id to an iterable of sequences.
    <full_seq2ind>  mapping from dataset name to a {sequence: index} mapping;
                    sequences missing from a dataset are skipped for it.

    Returns (query_cids, query_idx): the picked cluster ids in pick order and
    a dict holding, for every dataset name, the list of collected indices.
    Fewer than <expected_num> indices are returned when the clusters are
    exhausted first.
    """
    query_cids = []
    # build query_idx for each dataset
    query_idx = {dataname: [] for dataname in full_seq2ind}
    picked = set()  # O(1) "already selected?" test

    cur_len = 0
    cur_idx_lst = list(range(len(clu_nums)))
    # Stop when the candidates are used up instead of letting
    # random.choice() raise IndexError on an empty list.
    while cur_len < expected_num and cur_idx_lst:
        cur_idx = random.choice(cur_idx_lst)
        # Drop the candidate right away so that a repeated cluster id can
        # never be drawn again and again.
        cur_idx_lst.remove(cur_idx)
        cid, _ = clu_nums[cur_idx]
        # check if this cluster has been selected
        if cid in picked:
            continue
        # dict.fromkeys de-duplicates while keeping first-seen order, so the
        # output order does not depend on string hashing.
        for seq in dict.fromkeys(cid2clu[cid]):
            # seq2ind: ensure it is in limited lengths
            for dataname, seq2ind in full_seq2ind.items():
                if seq in seq2ind:
                    query_idx[dataname].append(seq2ind[seq])
                    cur_len += 1
        picked.add(cid)
        query_cids.append(cid)
    return query_cids, query_idx


def get_inds(expected_num, clu_nums, cid2clu, seq2ind):
    """
    Randomly pick whole clusters, without replacement, until at least
    <expected_num> sequence indices have been gathered, skipping clusters
    whose declared size would overshoot the target too far.

    A cluster is skipped when taking it would leave the running total
    further from <expected_num> than it is now, i.e. when its size exceeds
    twice the number of indices still missing.  The running total never
    decreases, so a cluster skipped once would be skipped on every later
    draw as well; it is therefore discarded for good.

    NOTE: an earlier definition of ``get_inds`` exists above in this module;
    being defined later, this one is the definition in effect after import.

    <expected_num> target number of indices to collect.
    <clu_nums>     sequence of (cluster id, size) pairs.
    <cid2clu>      mapping from cluster id to an iterable of sequences.
    <seq2ind>      mapping from sequence to index; sequences missing from it
                   are skipped and do not count towards the target.

    Returns (query_cids, query_idx): the picked cluster ids in pick order and
    the collected indices.  Fewer than <expected_num> indices are returned
    when no usable cluster is left.  Errors such as a cluster id missing from
    <cid2clu> now propagate instead of silently ending the selection.
    """
    query_cids, query_idx = [], []
    picked = set()  # O(1) "already selected?" test

    cur_len = 0
    cur_idx_lst = list(range(len(clu_nums)))
    # An empty candidate list ends the loop explicitly; no exception is
    # needed to signal exhaustion.
    while cur_len < expected_num and cur_idx_lst:
        cur_idx = random.choice(cur_idx_lst)
        # Every drawn candidate leaves the pool, whether it is used or not.
        # This guarantees termination when only duplicate or oversized
        # clusters remain.
        cur_idx_lst.remove(cur_idx)
        cid, size = clu_nums[cur_idx]
        # check if this cluster has been selected
        if cid in picked:
            continue

        # check if this cluster is too big
        missing = expected_num - cur_len  # > 0 inside the loop
        if missing < abs(size - missing):
            continue

        for seq in cid2clu[cid]:
            # seq2ind: ensure it is in limited lengths
            if seq in seq2ind:
                query_idx.append(seq2ind[seq])
                cur_len += 1
        picked.add(cid)
        query_cids.append(cid)
    return query_cids, query_idx