| # coding=utf-8 | |
| # Copyright 2020 The Google Research Authors. | |
| # | |
| # Licensed under the Apache License, Version 2.0 (the "License"); | |
| # you may not use this file except in compliance with the License. | |
| # You may obtain a copy of the License at | |
| # | |
| # http://www.apache.org/licenses/LICENSE-2.0 | |
| # | |
| # Unless required by applicable law or agreed to in writing, software | |
| # distributed under the License is distributed on an "AS IS" BASIS, | |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| # See the License for the specific language governing permissions and | |
| # limitations under the License. | |
| # pylint: skip-file | |
| """Return training and evaluation/test datasets from config files.""" | |
| import torch | |
| import numpy as np | |
| import pandas as pd | |
| from tabular_transformer import GeneralTransformer | |
| import json | |
| import logging | |
| import os | |
| CATEGORICAL = "categorical" | |
| CONTINUOUS = "continuous" | |
| LOGGER = logging.getLogger(__name__) | |
| DATA_PATH = os.path.join(os.path.dirname(__file__), 'tabular_datasets') | |
| def _load_json(path): | |
| with open(path) as json_file: | |
| return json.load(json_file) | |
def _load_file(filename, loader):
    """Load *filename* from DATA_PATH with *loader*.

    ``np.load`` is special-cased to pass ``allow_pickle=True`` so that
    object arrays stored in the bundled datasets can be read.
    """
    full_path = os.path.join(DATA_PATH, filename)
    if loader == np.load:
        return loader(full_path, allow_pickle=True)
    return loader(full_path)
def _get_columns(metadata):
    """Return the indices of columns whose metadata type is CATEGORICAL."""
    return [
        idx
        for idx, col in enumerate(metadata['columns'])
        if col['type'] == CATEGORICAL
    ]
def load_data(name):
    """Load the train/test splits and metadata for dataset *name*.

    Reads ``data/<name>/train.csv``, ``data/<name>/test.csv`` and
    ``data/<name>/info.json`` relative to the current working directory.

    Args:
        name: Dataset directory name under ``data/``.

    Returns:
        Tuple ``(train, test, (cat_cols, info))`` where *train* and *test*
        are numpy arrays, *cat_cols* lists the categorical column indices
        (for non-regression tasks the target column indices are appended,
        since a classification target is itself categorical), and *info*
        is the parsed metadata dict.
    """
    data_dir = f'data/{name}'
    info_path = f'{data_dir}/info.json'
    train = pd.read_csv(f'{data_dir}/train.csv').to_numpy()
    test = pd.read_csv(f'{data_dir}/test.csv').to_numpy()
    # Fix: `info_path` was computed but unused — the literal path was
    # rebuilt in the open() call. Use the variable instead.
    with open(info_path, 'r') as f:
        info = json.load(f)
    task_type = info['task_type']
    cat_cols = info['cat_col_idx']
    target_cols = info['target_col_idx']
    if task_type != 'regression':
        # Classification targets are categorical columns too.
        cat_cols = cat_cols + target_cols
    return train, test, (cat_cols, info)
def get_dataset(FLAGS, evaluation=False):
    """Build transformed tabular training data and fitted transformers.

    Args:
        FLAGS: Config object exposing ``training_batch_size``,
            ``eval_batch_size`` and ``dataname`` attributes.
        evaluation: If True, use the evaluation batch size instead of the
            training batch size.

    Returns:
        Tuple of ``(train, train_con_data, train_dis_data, test,
        (transformer_con, transformer_dis, info), con_idx, dis_idx)`` —
        the raw train array, the transformed continuous and categorical
        data, the raw test array, the fitted transformers plus dataset
        metadata, and the continuous/categorical column index lists.

    Raises:
        ValueError: If the batch size is not divisible by the number of
            CUDA devices.
    """
    batch_size = FLAGS.training_batch_size if not evaluation else FLAGS.eval_batch_size
    # Fix: guard against ZeroDivisionError on CPU-only machines where
    # torch.cuda.device_count() == 0.
    num_devices = torch.cuda.device_count()
    if num_devices > 0 and batch_size % num_devices != 0:
        # Fix: original message had an unclosed paren and the two f-string
        # pieces were concatenated without a separating space ("by" + "the").
        raise ValueError(f'Batch size ({batch_size}) must be divisible by '
                         f'the number of devices ({num_devices})')
    # Create dataset builders for tabular data.
    train, test, cols = load_data(FLAGS.dataname)
    cols_idx = list(np.arange(train.shape[1]))
    dis_idx = cols[0]  # categorical (discrete) column indices
    con_idx = [x for x in cols_idx if x not in dis_idx]  # remaining columns are continuous
    # Split continuous and categorical features into separate arrays.
    train_con = train[:, con_idx]
    train_dis = train[:, dis_idx]
    # Re-index the categorical columns within the discrete sub-array
    # (positions 0..len(dis_idx)-1 after the split).
    cat_idx_ = list(np.arange(train_dis.shape[1]))[:len(cols[0])]
    transformer_con = GeneralTransformer()
    transformer_dis = GeneralTransformer()
    transformer_con.fit(train_con, [])
    transformer_dis.fit(train_dis, cat_idx_)
    train_con_data = transformer_con.transform(train_con)
    train_dis_data = transformer_dis.transform(train_dis)
    return train, train_con_data, train_dis_data, test, (transformer_con, transformer_dis, cols[1]), con_idx, dis_idx