File size: 7,542 Bytes
c4ac745
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
import numpy as np
import lib
from tab_ddpm.modules import MLPDiffusion, ResNetDiffusion
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

def get_model(
    model_name,
    model_params,
): 
    """
    Construct a diffusion backbone by name.

    :param model_name: one of 'mlp' or 'resnet'.
    :param model_params: keyword arguments forwarded to the model constructor.
    :return: the instantiated ``MLPDiffusion`` or ``ResNetDiffusion`` model.
    :raises ValueError: if ``model_name`` is not a recognized architecture.
    """
    print(model_name)
    if model_name == 'mlp':
        model = MLPDiffusion(**model_params)
    elif model_name == 'resnet':
        model = ResNetDiffusion(**model_params)
    else:
        # The original `raise "Unknown model!"` raised a plain string, which is
        # itself a TypeError in Python 3 — raise a proper exception instead.
        raise ValueError(f"Unknown model: {model_name!r}")
    return model

def update_ema(target_params, source_params, rate=0.999):
    """
    Move each target parameter toward its source counterpart via an
    exponential moving average.

    After the update, every target tensor ``t`` holds
    ``t * rate + s * (1 - rate)`` where ``s`` is the matching source tensor.
    The update is performed in place on detached views, so no gradients flow.

    :param target_params: sequence of EMA (target) tensors, updated in place.
    :param source_params: sequence of live (source) tensors, read only.
    :param rate: EMA decay; values closer to 1 make the average move slower.
    """
    blend = 1 - rate
    for ema_tensor, live_tensor in zip(target_params, source_params):
        ema_tensor.detach().mul_(rate).add_(live_tensor.detach(), alpha=blend)

def concat_y_to_X(X, y):
    """
    Prepend ``y`` as the first column of ``X``.

    :param X: 2-D feature array, or None when there are no features.
    :param y: 1-D label array; reshaped to a single column.
    :return: array whose first column is ``y``; just the column when X is None.
    """
    y_column = y.reshape(-1, 1)
    if X is None:
        return y_column
    return np.concatenate([y_column, X], axis=1)

def make_dataset_from_df(
        df, 
        T,
        is_y_cond,
        ratios=(0.7, 0.2, 0.1),  # tuple, not list: avoid a mutable default argument
        df_info=None,
        std=0
    ):
    """
    Split a dataframe into train/val/test and package it as a lib.Dataset.

    The order of the generated dataset: (y, X_num, X_cat)

    is_y_cond:
        concat: y is concatenated to X, the model learn a joint distribution of (y, X)
        embedding: y is not concatenated to X. During computations, y is embedded
            and added to the latent vector of X
        none: y column is completely ignored

    How does is_y_cond affect the generation of y?
    is_y_cond:
        concat: the model synthesizes (y, X) directly, so y is just the first column
        embedding: y is first sampled using empirical distribution of y. The model only 
            synthesizes X. When returning the generated data, we return the generated X
            and the sampled y. (y is sampled from empirical distribution, instead of being
            generated by the model)
            Note that in this way, y is still not independent of X, because the model has been
            adding the embedding of y to the latent vector of X during computations.
        none: 
            y is synthesized using y's empirical distribution. X is generated by the model.
            In this case, y is completely independent of X.

    Note: For now, n_classes has to be set to 0. This is because our matrix is the concatenation
    of (X_num, X_cat). In this case, if we have is_y_cond == 'concat', we can guarantee that y 
    is the first column of the matrix.
    However, if we have n_classes > 0, then y is not the first column of the matrix.

    :param df: full dataframe containing features and the target column.
    :param T: transformation spec forwarded to lib.transform_dataset.
    :param is_y_cond: 'concat', 'embedding', or 'none' (see above).
    :param ratios: (train, val, test) split fractions; only indexed, never mutated.
    :param df_info: dict with keys 'n_classes', 'cat_cols', 'num_cols', 'y_col',
        'task_type'. 'cat_cols'/'num_cols' may be None when absent.
    :param std: if > 0, Gaussian noise of this std is added to the
        label-encoded categorical columns.
    :return: (transformed dataset, {col_index: LabelEncoder}, column order list)
    """
    train_val_df, test_df = train_test_split(df, test_size=ratios[2], random_state=42)
    train_df, val_df = train_test_split(
        train_val_df, 
        test_size=ratios[1] / (ratios[0] + ratios[1]), random_state=42
    )

    cat_column_orders = []
    num_column_orders = []
    index_to_column = list(df.columns)
    column_to_index = {col: i for i, col in enumerate(index_to_column)}

    if df_info['n_classes'] > 0:
        # Classification: y is categorical, so when is_y_cond == 'concat' it is
        # prepended to the categorical block.
        X_cat = {} if df_info['cat_cols'] is not None or is_y_cond == 'concat' else None
        X_num = {} if df_info['num_cols'] is not None else None
        y = {}

        cat_cols_with_y = []
        if df_info['cat_cols'] is not None:
            cat_cols_with_y += df_info['cat_cols']
        if is_y_cond == 'concat':
            cat_cols_with_y = [df_info['y_col']] + cat_cols_with_y

        if len(cat_cols_with_y) > 0:
            X_cat['train'] = train_df[cat_cols_with_y].to_numpy(dtype=np.str_)
            X_cat['val'] = val_df[cat_cols_with_y].to_numpy(dtype=np.str_)
            X_cat['test'] = test_df[cat_cols_with_y].to_numpy(dtype=np.str_)
        
        y['train'] = train_df[df_info['y_col']].values.astype(np.float32)
        y['val'] = val_df[df_info['y_col']].values.astype(np.float32)
        y['test'] = test_df[df_info['y_col']].values.astype(np.float32)

        if df_info['num_cols'] is not None:
            X_num['train'] = train_df[df_info['num_cols']].values.astype(np.float32)
            X_num['val'] = val_df[df_info['num_cols']].values.astype(np.float32)
            X_num['test'] = test_df[df_info['num_cols']].values.astype(np.float32)

        cat_column_orders = [column_to_index[col] for col in cat_cols_with_y]
        # num_cols may be None here; original code crashed iterating it.
        num_column_orders = [column_to_index[col] for col in (df_info['num_cols'] or [])]

    else:
        # Regression: y is numeric, so when is_y_cond == 'concat' it is
        # prepended to the numerical block.
        X_cat = {} if df_info['cat_cols'] is not None else None
        X_num = {} if df_info['num_cols'] is not None or is_y_cond == 'concat' else None
        y = {}

        num_cols_with_y = []
        if df_info['num_cols'] is not None:
            num_cols_with_y += df_info['num_cols']
        if is_y_cond == 'concat':
            num_cols_with_y = [df_info['y_col']] + num_cols_with_y

        if len(num_cols_with_y) > 0:
            X_num['train'] = train_df[num_cols_with_y].values.astype(np.float32)
            X_num['val'] = val_df[num_cols_with_y].values.astype(np.float32)
            X_num['test'] = test_df[num_cols_with_y].values.astype(np.float32)
        
        y['train'] = train_df[df_info['y_col']].values.astype(np.float32)
        y['val'] = val_df[df_info['y_col']].values.astype(np.float32)
        y['test'] = test_df[df_info['y_col']].values.astype(np.float32)

        if df_info['cat_cols'] is not None:
            X_cat['train'] = train_df[df_info['cat_cols']].to_numpy(dtype=np.str_)
            X_cat['val'] = val_df[df_info['cat_cols']].to_numpy(dtype=np.str_)
            X_cat['test'] = test_df[df_info['cat_cols']].to_numpy(dtype=np.str_)

        # cat_cols may be None here; original code crashed iterating it.
        cat_column_orders = [column_to_index[col] for col in (df_info['cat_cols'] or [])]
        num_column_orders = [column_to_index[col] for col in num_cols_with_y]

    
    # Final column order mirrors the (X_num, X_cat) concatenation below.
    column_orders = num_column_orders + cat_column_orders
    column_orders = [index_to_column[index] for index in column_orders]
    
    label_encoders = {}
    # Encode whenever the categorical dict actually holds columns. Checking the
    # dict (not df_info['cat_cols']) avoids a TypeError when cat_cols is None
    # but is_y_cond == 'concat' put y into X_cat.
    if X_cat is not None and len(X_cat) > 0:
        X_cat_all = np.vstack((X_cat['train'], X_cat['val'], X_cat['test']))
        X_cat_converted = []
        for col_index in range(X_cat_all.shape[1]):
            label_encoder = LabelEncoder()
            X_cat_converted.append(label_encoder.fit_transform(X_cat_all[:, col_index]).astype(float))
            if std > 0:
                # add noise
                X_cat_converted[-1] += np.random.normal(0, std, X_cat_converted[-1].shape)
            label_encoders[col_index] = label_encoder

        X_cat_converted = np.vstack(X_cat_converted).T

        train_num = X_cat['train'].shape[0]
        val_num = X_cat['val'].shape[0]
        test_num = X_cat['test'].shape[0]

        X_cat['train'] = X_cat_converted[: train_num, :]
        X_cat['val'] = X_cat_converted[train_num: train_num + val_num, :]
        X_cat['test'] = X_cat_converted[train_num + val_num:, :]

        # X_num may be None (no numeric cols at all); original len(X_num)
        # raised TypeError in that case.
        if X_num is not None and len(X_num) > 0:
            X_num['train'] = np.concatenate((X_num['train'], X_cat['train']), axis=1)
            X_num['val'] = np.concatenate((X_num['val'], X_cat['val']), axis=1)
            X_num['test'] = np.concatenate((X_num['test'], X_cat['test']), axis=1)
        else:
            # No numeric block: the encoded categorical matrix becomes X_num.
            X_num = X_cat
            X_cat = None

    D = lib.Dataset(
        X_num,
        None,
        y,
        y_info={},
        task_type=lib.TaskType(df_info['task_type']),
        n_classes=df_info['n_classes']
    )

    return lib.transform_dataset(D, T, None), label_encoders, column_orders