File size: 17,540 Bytes
2c5120d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
import copy
import math
from functools import partial

import numpy as np
import pandas as pd
import xgboost as xgb
from joblib import delayed, Parallel
from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MinMaxScaler

from _ForestDiffusion.utils.diffusion import VPSDE, get_pc_sampler
from _ForestDiffusion.utils.utils_diffusion import build_data_xt, euler_solve


## Class for the flow-matching or diffusion model
# Categorical features should be numerical (rather than strings), make sure to use x = pd.factorize(x)[0] to make them as such
# Make sure to specify which features are categorical and which are integers
# Note: Binary features can be considered integers since they will be rounded to the nearest integer and then clipped
class ForestDiffusionModel():
    def __init__(self, X,
                 label_y=None,  # categorical/binary variable for conditional generation
                 n_t=50,  # number of noise levels
                 model='xgboost',  # model type: xgboost, random_forest, lgbm
                 diffusion_type='flow',  # flow or vp (flow is better, vp for imputation)
                 max_depth=7, 
                 n_estimators=100, 
                 eta=0.3,  # xgboost params
                 num_leaves=31,  # lgbm params
                 duplicate_K=100,  # noise samples per real data sample
                 bin_indexes=[],  # binary column indices
                 cat_indexes=[],  # categorical column indices (≥3 categories)
                 int_indexes=[],  # integer column indices
                 true_min_max_values=None,  # optional min/max bounds [[min_vals], [max_vals]]
                 gpu_hist=False,  # use GPU
                 eps=1e-3,
                 beta_min=0.1,
                 beta_max=8,
                 n_jobs=-1,  # CPU cores to use
                 seed=666):

        np.random.seed(seed)

        # Remove rows with all missing values
        obs_to_remove = np.isnan(X).all(axis=1)
        X = X[~obs_to_remove]
        if label_y is not None:
            label_y = label_y[~obs_to_remove]

        # Combine binary and integer indices
        int_indexes = int_indexes + bin_indexes

        # Set min/max values for normalization
        if true_min_max_values is not None:
            self.X_min = true_min_max_values[0]
            self.X_max = true_min_max_values[1]
        else:
            self.X_min = np.nanmin(X, axis=0, keepdims=1)
            self.X_max = np.nanmax(X, axis=0, keepdims=1)

        # Store indices for later use
        self.cat_indexes = cat_indexes
        self.int_indexes = int_indexes
        
        # One-hot encode categorical variables
        if len(self.cat_indexes) > 0:
            X, self.X_names_before, self.X_names_after = self.dummify(X)

        # Normalize data to [-1, 1] range
        self.scaler = MinMaxScaler(feature_range=(-1, 1))
        X = self.scaler.fit_transform(X)

        # Store normalized data and dimensions
        self.X1 = copy.deepcopy(X)
        self.b, self.c = X.shape
        
        # Store parameters
        self.n_t = n_t
        self.duplicate_K = duplicate_K
        self.model = model
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.seed = seed
        self.num_leaves = num_leaves
        self.eta = eta
        self.gpu_hist = gpu_hist
        self.label_y = label_y
        self.n_jobs = n_jobs
        self.diffusion_type = diffusion_type
        self.eps = eps
        self.beta_min = beta_min
        self.beta_max = beta_max

        # Check for missing data with random forest
        if model == 'random_forest' and np.sum(np.isnan(X)) > 0:
            raise ValueError('Random forest cannot handle missing data')

        # Set up diffusion process
        self.sde = None
        if diffusion_type == 'vp':
            self.sde = VPSDE(beta_min=self.beta_min, beta_max=self.beta_max, N=n_t)
        elif diffusion_type != 'flow':
            raise ValueError("diffusion_type must be 'vp' or 'flow'")

        # Duplicate data for better learning
        X1 = np.tile(X, (duplicate_K, 1)) if duplicate_K > 1 else X
        X0 = np.random.normal(size=X1.shape)  # Noise data

        # Set up class labels
        self._setup_class_labels(X1.shape[0])

        # Create training data
        X_train, y_train = build_data_xt(X0, X1, n_t=self.n_t, diffusion_type=self.diffusion_type, 
                                         eps=self.eps, sde=self.sde)

        # Train models
        self._train_models(X_train, y_train)

    def _setup_class_labels(self, data_size):
        """Set up class labels and masks for conditional generation"""
        if self.label_y is not None:
            # Check for missing values in labels
            if np.sum(np.isnan(self.label_y)) > 0:
                raise ValueError("Cannot have missing values in label_y")
                
            # Get unique labels and their probabilities
            self.y_uniques, self.y_probs = np.unique(self.label_y, return_counts=True)
            self.y_probs = self.y_probs / np.sum(self.y_probs)
            
            # Create masks for each label
            self.mask_y = {}
            for i, y_val in enumerate(self.y_uniques):
                mask = np.zeros(self.b, dtype=bool)
                mask[self.label_y == y_val] = True
                self.mask_y[y_val] = np.tile(mask, (self.duplicate_K))
        else:
            # Default to a single class
            self.y_probs = np.array([1.0])
            self.y_uniques = np.array([0])
            self.mask_y = {0: np.ones(data_size, dtype=bool)}

    def _train_models(self, X_train, y_train):
        """Train regression models for each timestep, class, and feature"""
        n_steps = self.n_t
        
        # Reshape data for training
        X_reshaped = X_train.reshape(self.n_t, self.b * self.duplicate_K, self.c)
        y_reshaped = y_train.reshape(self.b * self.duplicate_K, self.c)
        
        # Initialize model storage
        self.regr = [[[None for k in range(self.c)] for i in range(n_steps)] for j in self.y_uniques]
        
        if self.n_jobs == 1:
            # Single-threaded training
            for i in range(n_steps):
                for j in range(len(self.y_uniques)):
                    for k in range(self.c):
                        self.regr[j][i][k] = self.train_parallel(
                            X_reshaped[i][self.mask_y[self.y_uniques[j]], :],
                            y_reshaped[self.mask_y[self.y_uniques[j]], k]
                        )
        else:
            # Parallel training
            flat_models = Parallel(n_jobs=self.n_jobs)(
                delayed(self.train_parallel)(
                    X_reshaped[i][self.mask_y[self.y_uniques[j]], :],
                    y_reshaped[self.mask_y[self.y_uniques[j]], k]
                ) 
                for i in range(n_steps) 
                for j in range(len(self.y_uniques)) 
                for k in range(self.c)
            )
            
            # Reshape flat list into 3D structure
            idx = 0
            for i in range(n_steps):
                for j in range(len(self.y_uniques)):
                    for k in range(self.c):
                        self.regr[j][i][k] = flat_models[idx]
                        idx += 1

    def train_parallel(self, X_train, y_train):

        if self.model == 'random_forest':
            out = RandomForestRegressor(n_estimators=self.n_estimators, max_depth=self.max_depth,
                                        random_state=self.seed)
        elif self.model == 'lgbm':
            out = LGBMRegressor(n_estimators=self.n_estimators, num_leaves=self.num_leaves, learning_rate=0.1,
                                random_state=self.seed, force_col_wise=True)
        elif self.model == 'catboost':
            raise NotImplemented("catboost usage has been disabled, please use 'random_forest', 'lgbm' or 'xgboost' "
                                 "instead")
        elif self.model == 'xgboost':
            out = xgb.XGBRegressor(n_estimators=self.n_estimators, objective='reg:squarederror', eta=self.eta,
                                   max_depth=self.max_depth,
                                   reg_lambda=0.0, subsample=1.0, seed=self.seed,
                                   tree_method='gpu_hist' if self.gpu_hist else 'hist',
                                   gpu_id=0 if self.gpu_hist else None)
        else:
            raise Exception("model value does not exists")

        y_no_miss = ~np.isnan(y_train)
        out.fit(X_train[y_no_miss, :], y_train[y_no_miss])

        return out

    def dummify(self, X):
        df = pd.DataFrame(X, columns=[str(i) for i in range(X.shape[1])])  # to Pandas
        df_names_before = df.columns
        for i in self.cat_indexes:
            df = pd.get_dummies(df, columns=[str(i)], prefix=str(i), dtype='float', drop_first=True)
        df_names_after = df.columns
        df = df.to_numpy()
        return df, df_names_before, df_names_after

    def unscale(self, X):
        if self.scaler is not None:  # unscale the min-max normalization
            X = self.scaler.inverse_transform(X)
        return X

    # Rounding for the categorical variables which are dummy-coded and then remove dummy-coding
    def clean_onehot_data(self, X):
        if len(self.cat_indexes) > 0:  # ex: [5, 3] and X_names_after [gender_a gender_b cartype_a cartype_b cartype_c]
            X_names_after = copy.deepcopy(self.X_names_after.to_numpy())
            prefixes = [x.split('_')[0] for x in self.X_names_after if
                        '_' in x]  # for all categorical variables, we have prefix ex: ['gender', 'gender']
            unique_prefixes = np.unique(prefixes)  # uniques prefixes
            for i in range(len(unique_prefixes)):
                cat_vars_indexes = [unique_prefixes[i] + '_' in my_name for my_name in self.X_names_after]
                cat_vars_indexes = np.where(cat_vars_indexes)[0]  # actual indexes
                cat_vars = X[:, cat_vars_indexes]  # [b, c_cat]
                # dummy variable, so third category is true if all dummies are 0
                cat_vars = np.concatenate((np.ones((cat_vars.shape[0], 1)) * 0.5, cat_vars), axis=1)
                # argmax of -1, -1, 0 is 0; so as long as they are below 0 we choose the implicit-final class
                max_index = np.argmax(cat_vars, axis=1)  # argmax across all the one-hot features (most likely category)
                X[:, cat_vars_indexes[0]] = max_index
                X_names_after[cat_vars_indexes[0]] = unique_prefixes[i]  # gender_a -> gender
            df = pd.DataFrame(X, columns=X_names_after)  # to Pandas
            df = df[self.X_names_before]  # remove all gender_b, gender_c and put everything in the right order
            X = df.to_numpy()
        return X

    # Round the integer features, then clip to prevent going beyond min-max
    def clip_extremes(self, X):
        if self.int_indexes is not None:
            for i in self.int_indexes:
                X[:, i] = np.round(X[:, i], decimals=0)
        small = (X < self.X_min).astype(float)
        X = small * self.X_min + (1 - small) * X
        big = (X > self.X_max).astype(float)
        X = big * self.X_max + (1 - big) * X
        return X

    # Return the score-fn or ode-flow output
    def my_model(self, t, y, mask_y=None):
        # y is [b*c]
        c = self.c
        b = y.shape[0] // c
        X = y.reshape(b, c)  # [b, c]

        # Output
        out = np.zeros(X.shape)  # [b, c]
        i = int(round(t * (self.n_t - 1)))
        for j, label in enumerate(self.y_uniques):
            if mask_y[label].sum() > 0:
                for k in range(self.c):
                    out[mask_y[label], k] = self.regr[j][i][k].predict(X[mask_y[label], :])

        if self.diffusion_type == 'vp':
            alpha_, sigma_ = self.sde.marginal_prob_coef(X, t)
            out = - out / sigma_
        out = out.reshape(-1)  # [b*c]
        return out

    # For imputation, we only give out and receive the missing values while ensuring consistency for the non-missing values
    # y0 is prior data, X_miss is real data
    def my_model_imputation(self, t, y, X_miss, sde=None, mask_y=None):

        y0 = np.random.normal(size=X_miss.shape)  # Noise data
        b, c = y0.shape

        if self.diffusion_type == 'vp':
            assert sde is not None
            mean, std = sde.marginal_prob(X_miss, t)
            X = mean + std * y0  # following the sde
        else:
            X = t * X_miss + (1 - t) * y0  # interpolation based on ground-truth for non-missing data
        mask_miss = np.isnan(X_miss)
        X[mask_miss] = y  # replace missing data by y(t)

        # Output
        out = np.zeros(X.shape)  # [b, c]
        i = int(round(t * (self.n_t - 1)))
        for j, label in enumerate(self.y_uniques):
            if mask_y[label].sum() > 0:
                for k in range(self.c):
                    out[mask_y[label], k] = self.regr[j][i][k].predict(X[mask_y[label], :])

        if self.diffusion_type == 'vp':
            alpha_, sigma_ = self.sde.marginal_prob_coef(X, t)
            out = - out / sigma_

        out = out[mask_miss]  # only return the missing data output
        out = out.reshape(-1)  # [-1]
        return out

    # Generate new data by solving the reverse ODE/SDE
    def generate(self, batch_size=None, n_t=None):
        """Sample new rows by integrating the learned model from Gaussian noise.

        Args:
            batch_size: number of rows to generate; defaults to the training size.
            n_t: number of solver steps; defaults to the training ``n_t``.

        Returns:
            2-D array in the original column layout (unscaled, categoricals
            decoded, values rounded/clipped). When the model was trained with
            labels, the sampled label is appended as the last column.
        """
        n_samples = self.b if batch_size is None else batch_size
        n_steps = self.n_t if n_t is None else n_t

        # Prior noise
        y0 = np.random.normal(size=(n_samples, self.c))

        # Draw one label per row according to the training label frequencies
        draws = np.random.multinomial(1, self.y_probs, size=y0.shape[0])
        label_y = self.y_uniques[np.argmax(draws, axis=1)]
        # Boolean row mask for each label value
        mask_y = {level: label_y == level for level in self.y_uniques}
        my_model = partial(self.my_model, mask_y=mask_y)

        flat_noise = y0.reshape(-1)
        if self.diffusion_type == 'vp':
            sde = VPSDE(beta_min=self.beta_min, beta_max=self.beta_max, N=n_steps)
            sampler = get_pc_sampler(my_model, sde=sde, denoise=True, eps=self.eps)
            ode_solved = sampler(flat_noise)
        else:
            ode_solved = euler_solve(my_model=my_model, y0=flat_noise, N=n_steps)  # [t, b*c]

        # Back to the original data space
        solution = ode_solved.reshape(y0.shape[0], self.c)  # [b, c]
        solution = self.unscale(solution)
        solution = self.clean_onehot_data(solution)
        solution = self.clip_extremes(solution)

        # Concatenate y label if needed
        if self.label_y is None:
            return solution
        return np.concatenate((solution, np.expand_dims(label_y, axis=1)), axis=1)

    # Impute missing data by solving the reverse ODE while keeping the non-missing data intact
    def impute(self, k=1, X=None, label_y=None, repaint=False, r=5, j=0.1, n_t=None):  # X is data with missing values
        assert self.diffusion_type != 'flow'  # cannot use with flow=matching

        if X is None:
            X = self.X1
        if label_y is None:
            label_y = self.label_y
        if n_t is None:
            n_t = self.n_t

        if self.diffusion_type == 'vp':
            sde = VPSDE(beta_min=self.beta_min, beta_max=self.beta_max, N=n_t)

        if label_y is None:  # single category 0
            mask_y = {}
            mask_y[0] = np.ones(X.shape[0], dtype=bool)
        else:
            mask_y = {}  # mask for which observations has a specific value of y
            for i in range(len(self.y_uniques)):
                mask_y[self.y_uniques[i]] = np.zeros(X.shape[0], dtype=bool)
                mask_y[self.y_uniques[i]][label_y == self.y_uniques[i]] = True

        my_model_imputation = partial(self.my_model_imputation, X_miss=X, sde=sde, mask_y=mask_y)

        for i in range(k):
            y0 = np.random.normal(size=X.shape)

            mask_miss = np.isnan(X)
            y0_miss = y0[mask_miss].reshape(-1)
            solution = copy.deepcopy(X)  # Solution start with dataset which contains some missing values
            if self.diffusion_type == 'vp':
                ode_solved = get_pc_sampler(my_model_imputation, sde=sde, denoise=True, repaint=repaint)(y0_miss, r=r,
                                                                                                         j=int(
                                                                                                             math.ceil(
                                                                                                                 j * n_t)))
                solution[mask_miss] = ode_solved  # replace missing values with imputed values
            solution = self.unscale(solution)
            solution = self.clean_onehot_data(solution)
            solution = self.clip_extremes(solution)
            # Concatenate y label if needed
            if self.label_y is not None:
                solution = np.concatenate((solution, np.expand_dims(label_y, axis=1)), axis=1)
            if i == 0:
                imputed_data = np.expand_dims(solution, axis=0)
            else:
                imputed_data = np.concatenate((imputed_data, np.expand_dims(solution, axis=0)), axis=0)
        return imputed_data[0] if k == 1 else imputed_data