# coding=utf-8
# Copyright 2020 The Google Research Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# pylint: skip-file
"""Return training and evaluation/test datasets from config files."""
import torch
import numpy as np
import pandas as pd
from tabular_transformer import GeneralTransformer
import json
import logging
import os

CATEGORICAL = "categorical"
CONTINUOUS = "continuous"

LOGGER = logging.getLogger(__name__)

DATA_PATH = os.path.join(os.path.dirname(__file__), 'tabular_datasets')

def _load_json(path):
    """Parse the JSON document stored at ``path`` and return the result.

    The file is opened explicitly as UTF-8 instead of with the platform's
    locale encoding, so non-ASCII content decodes identically on every host.

    Args:
        path: Filesystem path of the JSON file to read.

    Returns:
        The object produced by ``json.load`` for the file's content.
    """
    with open(path, encoding='utf-8') as json_file:
        return json.load(json_file)


def _load_file(filename, loader):
    """Load ``filename`` from the ``DATA_PATH`` directory with ``loader``.

    Args:
        filename: File name, joined onto ``DATA_PATH``.
        loader: Callable invoked with the resulting path. When it is
            ``np.load`` it is additionally passed ``allow_pickle=True``.

    Returns:
        Whatever ``loader`` returns for the file.
    """
    # NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects,
    # so this should only ever be pointed at trusted files.
    extra_kwargs = {'allow_pickle': True} if loader == np.load else {}
    return loader(os.path.join(DATA_PATH, filename), **extra_kwargs)


def _get_columns(metadata):
    """Return the positions of the categorical columns listed in ``metadata``.

    Args:
        metadata: Mapping with a ``'columns'`` sequence; each entry is a
            mapping whose ``'type'`` value is compared against ``CATEGORICAL``.

    Returns:
        A list with the index (position within ``metadata['columns']``) of
        every column whose type equals ``CATEGORICAL``, in order.
    """
    return [
        position
        for position, spec in enumerate(metadata['columns'])
        if spec['type'] == CATEGORICAL
    ]


def load_data(name):
    """Load the train/test splits and metadata of the dataset ``name``.

    Reads ``data/<name>/train.csv``, ``data/<name>/test.csv`` and
    ``data/<name>/info.json``. The ``data`` directory is a relative path, so
    it is resolved against the current working directory.

    Args:
        name: Name of the dataset directory under ``data/``.

    Returns:
        A tuple ``(train, test, (cat_cols, info))`` where ``train`` and
        ``test`` are the CSV contents converted to NumPy arrays, ``info`` is
        the decoded ``info.json``, and ``cat_cols`` is ``info['cat_col_idx']``
        with ``info['target_col_idx']`` appended unless
        ``info['task_type']`` is ``'regression'``.
    """
    data_dir = f'data/{name}'
    info_path = f'{data_dir}/info.json'

    train = pd.read_csv(f'{data_dir}/train.csv').to_numpy()
    test = pd.read_csv(f'{data_dir}/test.csv').to_numpy()

    # Decode the metadata as UTF-8 regardless of the host's locale.
    with open(info_path, 'r', encoding='utf-8') as f:
        info = json.load(f)

    task_type = info['task_type']
    cat_cols = info['cat_col_idx']
    target_cols = info['target_col_idx']

    if task_type != 'regression':
        # Outside of regression the target column(s) are listed with the
        # categorical ones. Concatenation builds a new list, so the list
        # stored in ``info`` is left untouched.
        cat_cols = cat_cols + target_cols

    return train, test, (cat_cols, info)
        

def get_dataset(FLAGS, evaluation=False):
    """Load a tabular dataset and fit transformers on its column groups.

    The training array is split into continuous and discrete columns, one
    ``GeneralTransformer`` is fitted per group, and the transformed training
    data is returned together with the raw arrays.

    Args:
        FLAGS: Configuration object; the attributes ``training_batch_size``,
            ``eval_batch_size`` and ``dataname`` are read.
        evaluation: When true, the batch-size check uses
            ``FLAGS.eval_batch_size`` instead of
            ``FLAGS.training_batch_size``. It has no other effect here.

    Returns:
        A 7-tuple ``(train, train_con_data, train_dis_data, test,
        (transformer_con, transformer_dis, info), con_idx, dis_idx)``.
        ``train`` and ``test`` are the arrays from ``load_data`` (``test`` is
        returned untransformed), ``info`` is the metadata from ``load_data``,
        and ``con_idx`` / ``dis_idx`` are the column indices of ``train`` that
        went to the continuous / discrete transformer.

    Raises:
        ValueError: If the selected batch size is not a multiple of the
            number of devices.
    """
    batch_size = FLAGS.eval_batch_size if evaluation else FLAGS.training_batch_size

    # torch.cuda.device_count() is 0 on CPU-only hosts; count that as a single
    # device so the modulo below cannot divide by zero.
    num_devices = max(torch.cuda.device_count(), 1)
    if batch_size % num_devices != 0:
        raise ValueError(f'Batch size ({batch_size}) must be divisible by '
                         f'the number of devices ({num_devices})')

    # Create dataset builders for tabular data.
    train, test, cols = load_data(FLAGS.dataname)
    dis_idx, info = cols

    # Every column that is not listed as discrete is treated as continuous.
    cols_idx = list(np.arange(train.shape[1]))
    discrete = set(dis_idx)
    con_idx = [x for x in cols_idx if x not in discrete]

    # Split continuous and categorical columns.
    train_con = train[:, con_idx]
    train_dis = train[:, dis_idx]

    # After the split every column of train_dis is discrete, so the indices
    # handed to the transformer are simply 0..n_discrete-1.
    cat_idx_ = list(np.arange(train_dis.shape[1]))

    transformer_con = GeneralTransformer()
    transformer_dis = GeneralTransformer()

    # NOTE(review): the second argument of fit() is presumably the list of
    # categorical column indices (none for the continuous block) - confirm
    # against tabular_transformer.GeneralTransformer.
    transformer_con.fit(train_con, [])
    transformer_dis.fit(train_dis, cat_idx_)

    train_con_data = transformer_con.transform(train_con)
    train_dis_data = transformer_dis.transform(train_dis)

    return (train, train_con_data, train_dis_data, test,
            (transformer_con, transformer_dis, info), con_idx, dis_idx)