# IRG / baselines / ClavaDDPM / scripts / utils_train.py
# Source: first commit (c4ac745) by Zilong-Zhao
import numpy as np
import lib
from tab_ddpm.modules import MLPDiffusion, ResNetDiffusion
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
def get_model(
    model_name,
    model_params,
):
    """Instantiate a diffusion backbone by name.

    :param model_name: 'mlp' (MLPDiffusion) or 'resnet' (ResNetDiffusion).
    :param model_params: keyword arguments forwarded to the model constructor.
    :return: the constructed model instance.
    :raises ValueError: if ``model_name`` is not one of the supported names.
    """
    print(model_name)
    if model_name == 'mlp':
        model = MLPDiffusion(**model_params)
    elif model_name == 'resnet':
        model = ResNetDiffusion(**model_params)
    else:
        # Fix: `raise "Unknown model!"` is illegal in Python 3 (raising a str
        # produces "TypeError: exceptions must derive from BaseException").
        raise ValueError('Unknown model!')
    return model
def update_ema(target_params, source_params, rate=0.999):
    """Move ``target_params`` toward ``source_params`` by an exponential moving average.

    Each target tensor is updated in place as::

        target <- rate * target + (1 - rate) * source

    :param target_params: sequence of target (EMA) tensors; mutated in place.
    :param source_params: sequence of source tensors; read only.
    :param rate: EMA decay factor; closer to 1 means the targets move more slowly.
    """
    blend = 1 - rate
    for ema_p, model_p in zip(target_params, source_params):
        # detach() keeps the in-place arithmetic out of the autograd graph.
        ema_p.detach().mul_(rate).add_(model_p.detach(), alpha=blend)
def concat_y_to_X(X, y):
    """Prepend ``y`` as the first column of feature matrix ``X``.

    :param X: 2-D feature array, or None if there are no features.
    :param y: 1-D target array; reshaped into a column vector.
    :return: array ``[y | X]`` with y as column 0, or the lone y column when X is None.
    """
    y_column = y.reshape(-1, 1)
    if X is None:
        return y_column
    return np.concatenate([y_column, X], axis=1)
def make_dataset_from_df(
    df,
    T,
    is_y_cond,
    ratios=(0.7, 0.2, 0.1),
    df_info=None,
    std=0
):
    """
    Build a ``lib.Dataset`` (plus per-column label encoders and the column order)
    from a dataframe.

    The order of the generated dataset: (y, X_num, X_cat)

    is_y_cond:
        concat: y is concatenated to X, the model learns a joint distribution of (y, X)
        embedding: y is not concatenated to X. During computations, y is embedded
            and added to the latent vector of X
        none: y column is completely ignored

    How does is_y_cond affect the generation of y?
    is_y_cond:
        concat: the model synthesizes (y, X) directly, so y is just the first column
        embedding: y is first sampled using the empirical distribution of y. The model
            only synthesizes X. When returning the generated data, we return the
            generated X and the sampled y. (y is sampled from the empirical
            distribution, instead of being generated by the model.)
            Note that in this way, y is still not independent of X, because the model
            has been adding the embedding of y to the latent vector of X during
            computations.
        none: y is synthesized using y's empirical distribution. X is generated by
            the model. In this case, y is completely independent of X.

    Note: For now, n_classes has to be set to 0. This is because our matrix is the
    concatenation of (X_num, X_cat). In this case, if we have is_y_cond == 'concat',
    we can guarantee that y is the first column of the matrix. However, if we have
    n_classes > 0, then y is not the first column of the matrix.

    :param df: source dataframe containing the y, numerical and categorical columns.
    :param T: transformation spec forwarded to ``lib.transform_dataset``.
    :param is_y_cond: 'concat' | 'embedding' | 'none' (see above).
    :param ratios: (train, val, test) split fractions; only indexed, never mutated.
    :param df_info: dict with keys 'n_classes', 'task_type', 'y_col', 'cat_cols',
        'num_cols' ('cat_cols' / 'num_cols' may be None).
    :param std: if > 0, add N(0, std) noise to the label-encoded categorical codes.
    :return: (transformed dataset, {cat column index: LabelEncoder}, column-name order)
    """
    # Deterministic 3-way split: carve off the test set first, then split the
    # remainder so the final train/val proportions match `ratios`.
    train_val_df, test_df = train_test_split(df, test_size=ratios[2], random_state=42)
    train_df, val_df = train_test_split(
        train_val_df,
        test_size=ratios[1] / (ratios[0] + ratios[1]), random_state=42
    )

    cat_column_orders = []
    num_column_orders = []
    index_to_column = list(df.columns)
    column_to_index = {col: i for i, col in enumerate(index_to_column)}

    if df_info['n_classes'] > 0:
        # Classification: y is categorical, so with is_y_cond == 'concat' it is
        # prepended to the categorical block.
        X_cat = {} if df_info['cat_cols'] is not None or is_y_cond == 'concat' else None
        X_num = {} if df_info['num_cols'] is not None else None
        y = {}

        cat_cols_with_y = []
        if df_info['cat_cols'] is not None:
            cat_cols_with_y += df_info['cat_cols']
        if is_y_cond == 'concat':
            cat_cols_with_y = [df_info['y_col']] + cat_cols_with_y

        if len(cat_cols_with_y) > 0:
            X_cat['train'] = train_df[cat_cols_with_y].to_numpy(dtype=np.str_)
            X_cat['val'] = val_df[cat_cols_with_y].to_numpy(dtype=np.str_)
            X_cat['test'] = test_df[cat_cols_with_y].to_numpy(dtype=np.str_)

        y['train'] = train_df[df_info['y_col']].values.astype(np.float32)
        y['val'] = val_df[df_info['y_col']].values.astype(np.float32)
        y['test'] = test_df[df_info['y_col']].values.astype(np.float32)

        if df_info['num_cols'] is not None:
            X_num['train'] = train_df[df_info['num_cols']].values.astype(np.float32)
            X_num['val'] = val_df[df_info['num_cols']].values.astype(np.float32)
            X_num['test'] = test_df[df_info['num_cols']].values.astype(np.float32)

        cat_column_orders = [column_to_index[col] for col in cat_cols_with_y]
        # Fix: guard against num_cols being None (the original iterated None here).
        num_column_orders = [column_to_index[col] for col in (df_info['num_cols'] or [])]
    else:
        # Regression: y is numerical, so with is_y_cond == 'concat' it is
        # prepended to the numerical block.
        X_cat = {} if df_info['cat_cols'] is not None else None
        X_num = {} if df_info['num_cols'] is not None or is_y_cond == 'concat' else None
        y = {}

        num_cols_with_y = []
        if df_info['num_cols'] is not None:
            num_cols_with_y += df_info['num_cols']
        if is_y_cond == 'concat':
            num_cols_with_y = [df_info['y_col']] + num_cols_with_y

        if len(num_cols_with_y) > 0:
            X_num['train'] = train_df[num_cols_with_y].values.astype(np.float32)
            X_num['val'] = val_df[num_cols_with_y].values.astype(np.float32)
            X_num['test'] = test_df[num_cols_with_y].values.astype(np.float32)

        y['train'] = train_df[df_info['y_col']].values.astype(np.float32)
        y['val'] = val_df[df_info['y_col']].values.astype(np.float32)
        y['test'] = test_df[df_info['y_col']].values.astype(np.float32)

        if df_info['cat_cols'] is not None:
            X_cat['train'] = train_df[df_info['cat_cols']].to_numpy(dtype=np.str_)
            X_cat['val'] = val_df[df_info['cat_cols']].to_numpy(dtype=np.str_)
            X_cat['test'] = test_df[df_info['cat_cols']].to_numpy(dtype=np.str_)

        # Fix: guard against cat_cols being None (the original iterated None here).
        cat_column_orders = [column_to_index[col] for col in (df_info['cat_cols'] or [])]
        num_column_orders = [column_to_index[col] for col in num_cols_with_y]

    # Final layout: numerical columns first, then categorical (with y first
    # within its block when is_y_cond == 'concat').
    column_orders = num_column_orders + cat_column_orders
    column_orders = [index_to_column[index] for index in column_orders]

    # Label-encode every categorical column (including y when it was concatenated).
    # Fix: the original tested len(df_info['cat_cols']), which (a) raises
    # TypeError when cat_cols is None but y was concatenated into X_cat, and
    # (b) wrongly skipped encoding the concatenated y when cat_cols == [].
    # Testing X_cat itself covers both cases.
    label_encoders = {}
    if X_cat is not None and len(X_cat) > 0:
        X_cat_all = np.vstack((X_cat['train'], X_cat['val'], X_cat['test']))
        X_cat_converted = []
        for col_index in range(X_cat_all.shape[1]):
            label_encoder = LabelEncoder()
            X_cat_converted.append(label_encoder.fit_transform(X_cat_all[:, col_index]).astype(float))
            if std > 0:
                # Optional gaussian jitter on the integer codes.
                X_cat_converted[-1] += np.random.normal(0, std, X_cat_converted[-1].shape)
            label_encoders[col_index] = label_encoder

        X_cat_converted = np.vstack(X_cat_converted).T
        train_num = X_cat['train'].shape[0]
        val_num = X_cat['val'].shape[0]

        X_cat['train'] = X_cat_converted[: train_num, :]
        X_cat['val'] = X_cat_converted[train_num: train_num + val_num, :]
        X_cat['test'] = X_cat_converted[train_num + val_num:, :]

        # Fold the encoded categoricals into the numerical block; the Dataset
        # below is always built with X_cat=None.
        # Fix: the original called len(X_num), which raises TypeError when
        # X_num is None; a truthiness test handles both None and {}.
        if X_num:
            X_num['train'] = np.concatenate((X_num['train'], X_cat['train']), axis=1)
            X_num['val'] = np.concatenate((X_num['val'], X_cat['val']), axis=1)
            X_num['test'] = np.concatenate((X_num['test'], X_cat['test']), axis=1)
        else:
            X_num = X_cat
            X_cat = None

    D = lib.Dataset(
        X_num,
        None,
        y,
        y_info={},
        task_type=lib.TaskType(df_info['task_type']),
        n_classes=df_info['n_classes']
    )

    return lib.transform_dataset(D, T, None), label_encoders, column_orders