# Source: IRG/baselines/ClavaDDPM/eval/mle/tabular_dataload.py
# (page-scrape residue removed from executable text: author caption
# "Zilong-Zhao's picture", commit message "first commit", commit c4ac745)
# coding=utf-8
# Copyright 2020 The Google Research Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# pylint: skip-file
"""Return training and evaluation/test datasets from config files."""
import torch
import numpy as np
import pandas as pd
from tabular_transformer import GeneralTransformer
import json
import logging
import os
# Column-type labels as they appear in dataset metadata (see _get_columns).
CATEGORICAL = "categorical"
CONTINUOUS = "continuous"
# Module-level logger, named after this module per logging convention.
LOGGER = logging.getLogger(__name__)
# Directory of bundled tabular datasets, resolved relative to this file.
DATA_PATH = os.path.join(os.path.dirname(__file__), 'tabular_datasets')
def _load_json(path):
with open(path) as json_file:
return json.load(json_file)
def _load_file(filename, loader):
    """Load *filename* from DATA_PATH with *loader* (e.g. np.load, _load_json)."""
    full_path = os.path.join(DATA_PATH, filename)
    # .npy/.npz dataset files may contain object arrays, which require
    # allow_pickle=True to deserialize.
    if loader is np.load:
        return loader(full_path, allow_pickle=True)
    return loader(full_path)
def _get_columns(metadata):
    """Return the indices of columns typed CATEGORICAL in *metadata*['columns']."""
    return [
        idx
        for idx, col in enumerate(metadata['columns'])
        if col['type'] == CATEGORICAL
    ]
def load_data(name):
    """Load the train/test splits and metadata for dataset *name*.

    Reads ``data/<name>/train.csv``, ``data/<name>/test.csv`` and
    ``data/<name>/info.json`` relative to the current working directory.

    Args:
        name: dataset directory name under ``data/``.

    Returns:
        Tuple ``(train, test, (cat_cols, info))`` where ``train``/``test``
        are 2-D numpy arrays and ``cat_cols`` lists the categorical column
        indices (including the target column for non-regression tasks).
    """
    data_dir = f'data/{name}'
    info_path = f'{data_dir}/info.json'

    train = pd.read_csv(f'{data_dir}/train.csv').to_numpy()
    test = pd.read_csv(f'{data_dir}/test.csv').to_numpy()

    # Bug fix: info_path was built but the f-string was rebuilt at open();
    # reuse the variable so path construction lives in one place.
    with open(info_path, 'r') as f:
        info = json.load(f)

    task_type = info['task_type']
    cat_cols = info['cat_col_idx']
    target_cols = info['target_col_idx']

    # For classification tasks the target column is itself categorical.
    if task_type != 'regression':
        cat_cols = cat_cols + target_cols

    return train, test, (cat_cols, info)
def get_dataset(FLAGS, evaluation=False):
    """Load dataset FLAGS.dataname and fit transformers for training.

    Args:
        FLAGS: config object exposing ``training_batch_size``,
            ``eval_batch_size`` and ``dataname`` attributes.
        evaluation: when True, validate ``eval_batch_size`` instead of
            ``training_batch_size``.

    Returns:
        Tuple ``(train, train_con_data, train_dis_data, test,
        (transformer_con, transformer_dis, info), con_idx, dis_idx)`` —
        the raw train array, the transformed continuous and discrete
        training matrices, the raw test array, the fitted transformers
        plus dataset metadata, and the continuous/discrete column indices.

    Raises:
        ValueError: if the batch size is not divisible by the device count.
    """
    batch_size = FLAGS.eval_batch_size if evaluation else FLAGS.training_batch_size
    # Bug fix: torch.cuda.device_count() is 0 on CPU-only machines, which
    # made the modulo below raise ZeroDivisionError; treat that as 1 device.
    num_devices = max(torch.cuda.device_count(), 1)
    if batch_size % num_devices != 0:
        # Bug fix: original message was missing the closing paren and the
        # space between the two f-string fragments ("by'f'the" -> "bythe").
        raise ValueError(f'Batch size ({batch_size}) must be divisible by '
                         f'the number of devices ({num_devices})')

    # Create dataset builders for tabular data.
    train, test, cols = load_data(FLAGS.dataname)
    cols_idx = list(np.arange(train.shape[1]))
    dis_idx = cols[0]  # categorical (discrete) column indices from metadata
    con_idx = [x for x in cols_idx if x not in dis_idx]  # remaining columns

    # Split continuous and categorical features.
    train_con = train[:, con_idx]
    train_dis = train[:, dis_idx]

    # Categorical columns re-indexed within the categorical-only matrix.
    cat_idx_ = list(np.arange(train_dis.shape[1]))[:len(cols[0])]

    transformer_con = GeneralTransformer()
    transformer_dis = GeneralTransformer()
    transformer_con.fit(train_con, [])
    transformer_dis.fit(train_dis, cat_idx_)
    train_con_data = transformer_con.transform(train_con)
    train_dis_data = transformer_dis.transform(train_dis)

    return (train, train_con_data, train_dis_data, test,
            (transformer_con, transformer_dis, cols[1]), con_idx, dis_idx)