import pandas as pd
import tomllib

# Integer codes for the string categories of selected columns, keyed by
# column name and then by category string.
# NOTE(review): the his_NACCNIHR keys look like abbreviated race categories;
# confirm their meaning against the data dictionary.
value_mapping = {
    'his_SEX': {
        'female': 0,
        'male': 1,
    },
    'his_HISPANIC': {
        'no': 0,
        'yes': 1,
    },
    'his_NACCNIHR': {
        'whi': 0,
        'blk': 1,
        'asi': 2,
        'ind': 3,
        'haw': 4,
        'mul': 5,
    },
}

# Columns treated as labels rather than input features. Kept as a list
# because it is used directly for DataFrame column selection.
label_names = [
    'amy_label',
    'tau_label',
]

class CSVDataset:
    ''' Row-wise dictionary access to a CSV table of features and labels.

    The data CSV is loaded with pandas. A second CSV (the config) lists, in
    its 'Name' column, the columns to use; names that appear in the
    module-level label_names are treated as labels, everything else as
    features. Indexing the dataset yields one (features, labels) pair of
    dicts for the requested row, with missing (NaN) values left out.

    Attributes set by __init__:
        df:           the full data table as read from dat_file.
        df_features:  the configured feature columns that exist in df.
        df_labels:    the label columns, or a column-less frame with the same
                      rows as df when any label column is absent.
    '''

    def __init__(self, dat_file, cnf_file):
        ''' Load the data table and select feature and label columns.

        Args:
            dat_file: path (or anything pandas.read_csv accepts) of the
                data CSV.
            cnf_file: path (or anything pandas.read_csv accepts) of a CSV
                with a 'Name' column listing the column names to use.

        Raises:
            KeyError: if the config CSV has no 'Name' column.
        '''
        df = pd.read_csv(dat_file)

        # NOTE: value_mapping is currently not applied; categorical columns
        # are passed through exactly as read from the CSV.

        # The config lists features and labels together; labels are split
        # off here so they never end up among the features.
        cnf = pd.read_csv(cnf_file)
        expected_features = [col for col in cnf['Name'] if col not in label_names]

        # Only use features that exist in both the config and the data.
        available_features = [col for col in expected_features if col in df.columns]
        missing_features = [col for col in expected_features if col not in df.columns]

        if missing_features:
            print(f"Warning: {len(missing_features)} features missing from data file:")
            print(f"Missing: {missing_features[:10]}...")

        print(f"Using {len(available_features)} out of {len(expected_features)} expected features")

        self.df = df
        self.df_features = df[available_features]

        if all(col in df.columns for col in label_names):
            self.df_labels = df[label_names]
        else:
            # Labels are all-or-nothing. The fallback keeps df's row index
            # (but no columns) so that positional row lookup in __getitem__
            # still works and yields an empty label dict; a completely
            # empty DataFrame has no rows and would raise IndexError.
            self.df_labels = pd.DataFrame(index=df.index)

    def __len__(self) -> int:
        ''' Return the number of rows in the data table. '''
        return len(self.df)

    def __getitem__(self, idx) -> tuple[dict, dict]:
        ''' Return the feature and label dicts for the row at position idx.

        Args:
            idx: positional (iloc-style) row index.

        Returns:
            (feature_dict, label_dict), each mapping column name to value
            with NaN entries omitted. label_dict is empty when the data
            file does not contain all label columns.

        Raises:
            IndexError: if idx is out of range.
        '''
        feature_dict = self._row_to_dict(self.df_features, idx)
        label_dict = self._row_to_dict(self.df_labels, idx)
        return feature_dict, label_dict

    @staticmethod
    def _row_to_dict(frame, idx) -> dict:
        ''' Return row idx of frame as a {column: value} dict without NaNs. '''
        return frame.iloc[idx].dropna().to_dict()

if __name__ == '__main__':
    # Manual smoke test: build a dataset from the local sample files and
    # show the (features, labels) pair of the second row.
    dset = CSVDataset(dat_file="./test.csv", cnf_file="./input_meta_info.csv")
    second_sample = dset[1]
    print(second_sample)