File size: 3,588 Bytes
"""Utility functions for pandas operations"""
from typing import List
import numpy as np
import pandas as pd
def apply_filters(df: pd.DataFrame, filters: dict, reset_index=False):
    """
    Keeps only the rows of df that satisfy every filter (key-values pairs).

    Args:
        df (pd.DataFrame): frame to filter; it is not modified.
        filters (dict): maps a column name to either a single value (the
            row is kept when the column equals it) or a collection of
            allowed values (list, tuple, np.ndarray or, when omegaconf is
            installed, an omegaconf ListConfig).
        reset_index (bool): when True, the result gets a fresh 0..n-1 index.

    Returns:
        pd.DataFrame: a new frame holding the rows that match all filters;
        a plain copy of df when filters is empty.
    """
    # omegaconf is only needed to recognise ListConfig values, so its
    # absence must not prevent filtering with plain Python containers.
    listlike_types = (list, tuple, np.ndarray)
    try:
        from omegaconf import ListConfig
    except ImportError:
        pass
    else:
        listlike_types += (ListConfig,)

    # Start from "keep everything" and AND in one condition per filter.
    keep = np.ones(len(df), dtype=bool)
    for col, values in filters.items():
        if isinstance(values, listlike_types):
            matches = df[col].isin(list(values))
        else:
            matches = df[col] == values
        keep &= np.asarray(matches, dtype=bool)

    # Copy after selecting so only the surviving rows are duplicated and the
    # result is independent of the caller's frame.
    result = df[keep].copy()
    if reset_index:
        result = result.reset_index(drop=True)
    return result
def apply_antifilters(df: pd.DataFrame, filters: dict, reset_index=False):
    """
    Removes the rows of df that match any of the given filters (key-values pairs).

    Args:
        df (pd.DataFrame): frame to filter; it is not modified.
        filters (dict): maps a column name to either a single value (the
            row is dropped when the column equals it) or a collection of
            values to drop (list, tuple, np.ndarray or, when omegaconf is
            installed, an omegaconf ListConfig).
        reset_index (bool): when True, the result gets a fresh 0..n-1 index.

    Returns:
        pd.DataFrame: a new frame without the rows matched by any filter;
        a plain copy of df when filters is empty.
    """
    # Accept the same list-like value types as apply_filters; omegaconf is
    # optional and only consulted to recognise ListConfig values.
    listlike_types = (list, tuple, np.ndarray)
    try:
        from omegaconf import ListConfig
    except ImportError:
        pass
    else:
        listlike_types += (ListConfig,)

    # Start from "drop nothing" and OR in the rows hit by each filter.
    drop = np.zeros(len(df), dtype=bool)
    for col, values in filters.items():
        if isinstance(values, listlike_types):
            matches = df[col].isin(list(values))
        else:
            matches = df[col] == values
        drop |= np.asarray(matches, dtype=bool)

    result = df[~drop].copy()
    if reset_index:
        result = result.reset_index(drop=True)
    return result
def custom_eval(x):
    """Splits string '["a", "b", "c"]' into ["a", "b", "c"].

    Square brackets are removed, the remainder is split on commas, and each
    item is stripped of surrounding whitespace and of one matching pair of
    surrounding quotes (single or double). Unquoted items such as
    '[a, b]' are returned unchanged apart from whitespace.

    Args:
        x: value to parse; anything that is not a str (e.g. NaN, None)
            is treated as missing.

    Returns:
        list: the parsed items, or ['NA'] when x is not a string. An empty
        string yields [''].
    """
    if not isinstance(x, str):
        return ['NA']

    items = []
    for piece in x.replace('[', '').replace(']', '').split(','):
        piece = piece.strip()
        # Drop the quotes only when they form a matching pair around the
        # item, so an apostrophe inside a value is left alone.
        if len(piece) >= 2 and piece[0] == piece[-1] and piece[0] in ('"', "'"):
            piece = piece[1:-1]
        items.append(piece)
    return items
def split_column_into_columns(df, column):
    """
    For given df, splits `column` containing values like '["a", "b"]'
    into one-hot subcolumns like a, b with `Yes`/`No` values.

    Every cell of `column` is first parsed with `custom_eval`. For each
    distinct parsed value (ignoring 'NA' and ''), a column named after the
    value is filled with 'Yes' on rows whose list contains it and 'No'
    elsewhere. A summary column `any_<column>` is 'Yes' when at least one
    of those columns is 'Yes' on the row.

    Args:
        df (pd.DataFrame): frame to expand. It is modified in place: the
            parsed lists replace `column` and the new columns are added.
        column: label of the column holding the list-like strings.

    Returns:
        pd.DataFrame: the same df object, for chaining.
    """
    df[column] = df[column].apply(custom_eval)

    # Work on the parsed lists positionally rather than through index
    # labels, so a non-unique index cannot turn a single-cell lookup into
    # a multi-row one.
    parsed_rows = df[column].tolist()

    # Distinct values in order of first appearance (this fixes the order
    # in which the new columns are added).
    unique_values = []
    seen = set()
    for values in parsed_rows:
        for value in values:
            if value in ('NA', '') or value in seen:
                continue
            seen.add(value)
            unique_values.append(value)

    for value in unique_values:
        present = [value in values for values in parsed_rows]
        if value in df.columns:
            # A column with this name already exists: only mark the hits
            # and fill its missing cells, leaving other content untouched.
            df.loc[np.array(present, dtype=bool), value] = 'Yes'
            df[value] = df[value].fillna('No')
        else:
            # Build the whole column at once instead of cell by cell.
            df[value] = ['Yes' if hit else 'No' for hit in present]

    has_any = (df[unique_values] == 'Yes').any(axis=1)
    df[f'any_{column}'] = ['Yes' if hit else 'No' for hit in has_any]
    return df
def custom_read_csv(path: str, columns_to_onehot: List) -> pd.DataFrame:
    """Read a CSV file and one-hot expand the requested columns.

    Args:
        path (str): location of the .csv file to read
        columns_to_onehot (List): names of the columns that are passed, one
            after another, through `split_column_into_columns`

    Returns:
        pd.DataFrame: the loaded frame after all expansions
    """
    frame = pd.read_csv(path)
    remaining = iter(columns_to_onehot)
    while True:
        try:
            name = next(remaining)
        except StopIteration:
            # Every requested column has been expanded.
            return frame
        frame = split_column_into_columns(frame, name)
def split_df(df, test_size=0.2, random_state=42, val_size=None):
    """Split df into train, validation and test sets.

    The test set is split off first; the validation set is then taken from
    the remaining training rows, so its fraction is relative to that
    remainder, not to the full df.

    Args:
        df: data to split; passed as-is to sklearn's `train_test_split`.
        test_size: size of the test split, forwarded to `train_test_split`.
        random_state: seed used for both splits (42 by default, matching
            the previously hard-coded value).
        val_size: size of the validation split taken from the remaining
            training rows; defaults to `test_size` when None.

    Returns:
        tuple: (train_df, val_df, test_df)
    """
    from sklearn.model_selection import train_test_split

    if val_size is None:
        val_size = test_size

    # First carve out the test set ...
    train_df, test_df = train_test_split(
        df, test_size=test_size, random_state=random_state
    )
    # ... then take the validation set from what is left for training.
    train_df, val_df = train_test_split(
        train_df, test_size=val_size, random_state=random_state
    )
    return train_df, val_df, test_df
def load_csv_with_progress_bar(path: str, chunksize: int = 10000) -> pd.DataFrame:
    """Load a CSV file chunk by chunk while showing a tqdm progress bar.

    Args:
        path (str): path to a local .csv file
        chunksize (int): number of rows read per chunk

    Returns:
        pd.DataFrame: all chunks concatenated into a single frame
    """
    from tqdm import tqdm

    # The bar advances once per chunk, so its total has to be a number of
    # chunks; file size in bytes divided by rows-per-chunk is not one.
    # Count the lines up front instead. This assumes one header line and
    # one record per line, so the total is only approximate for files with
    # multi-line quoted fields, no header, or compression.
    with open(path, 'rb') as handle:
        line_count = sum(1 for _ in handle)
    row_count = max(line_count - 1, 0)
    # Ceiling division, with at least one step so the bar is never 0-length.
    total_chunks = max(-(-row_count // chunksize), 1)

    chunks = list(
        tqdm(
            pd.read_csv(path, chunksize=chunksize),
            desc='Loading data',
            total=total_chunks,
        )
    )
    return pd.concat(chunks)
|