# IRG / baselines / ClavaDDPM / preprocess_utils.py
# Zilong-Zhao - first commit (c4ac745)
import json
import os
import pickle
from collections import deque
from datetime import datetime
from datetime import timedelta

import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
def calculate_days_since_earliest_date(dates):
    """Express each date as a day offset from the earliest date in the input.

    Args:
        dates: Iterable of date strings in ``%y%m%d`` form (two-digit year,
            month, day), e.g. ``'930105'``.

    Returns:
        A tuple ``(days_since, earliest)`` where ``days_since`` is a list of
        integer day offsets in input order and ``earliest`` is the earliest
        date formatted back as a ``%y%m%d`` string.

    Raises:
        ValueError: If a string does not match the format, or if ``dates`` is
            empty (``min`` of an empty list).
    """
    fmt = '%y%m%d'
    parsed = [datetime.strptime(text, fmt) for text in dates]
    origin = min(parsed)
    offsets = []
    for moment in parsed:
        offsets.append((moment - origin).days)
    return offsets, origin.strftime(fmt)
def reconstruct_dates(days_since, earliest_date_str):
    """Turn day offsets back into ``%y%m%d`` date strings.

    Args:
        days_since: Iterable of day offsets relative to ``earliest_date_str``.
        earliest_date_str: Reference date as a ``%y%m%d`` string.

    Returns:
        A list of ``%y%m%d`` strings, one per offset, in input order.
    """
    fmt = '%y%m%d'
    origin = datetime.strptime(earliest_date_str, fmt)
    restored = []
    for offset in days_since:
        shifted = origin + timedelta(days=offset)
        restored.append(shifted.strftime(fmt))
    return restored
def birth_number_split(birth_numbers):
    """Split ``YYMMDD``-style birth numbers into year, month, day and gender.

    Characters 0-1 are parsed as the year, 2-3 as the month and 4-5 as the
    day. A month value of 50 or more is reduced by 50 and flagged with
    gender ``1``; otherwise the gender flag is ``0``.
    NOTE(review): presumably this follows the convention where +50 on the
    month marks one gender -- confirm which against the dataset docs.

    The input is walked exactly once, so one-shot iterators are handled the
    same way as lists.

    Args:
        birth_numbers: Iterable of sliceable strings of at least six digits.

    Returns:
        A tuple ``(years, months, days, genders)`` of four lists of ints,
        aligned with the input order.

    Raises:
        ValueError: If any of the three slices is not a valid integer.
    """
    years, months, days, genders = [], [], [], []
    for bn in birth_numbers:
        month = int(bn[2:4])
        flagged = month >= 50
        years.append(int(bn[:2]))
        months.append(month - 50 if flagged else month)
        days.append(int(bn[4:6]))
        genders.append(1 if flagged else 0)
    return years, months, days, genders
def table_label_encode(df, discrete_cols):
    """Label-encode the given columns on a copy of ``df``.

    Args:
        df: Input DataFrame; it is copied, not modified.
        discrete_cols: Iterable of column names to encode.

    Returns:
        A tuple ``(encoded_df, label_encoders)`` where ``label_encoders`` maps
        each encoded column name to the ``LabelEncoder`` fitted on it.
    """
    encoded = df.copy()
    encoders = {}
    for column in discrete_cols:
        # One independent encoder per column, kept so the mapping can be
        # inverted later.
        encoder = LabelEncoder()
        encoded[column] = encoder.fit_transform(encoded[column])
        encoders[column] = encoder
    return encoded, encoders
def table_label_decode(df, label_encoders):
    """Invert label encoding on a copy of ``df``.

    Args:
        df: DataFrame holding encoded columns; it is copied, not modified.
        label_encoders: Mapping of column name to an object exposing
            ``inverse_transform``.

    Returns:
        A new DataFrame in which every column named in ``label_encoders`` has
        been replaced by its ``inverse_transform`` output.
    """
    decoded = df.copy()
    for column in label_encoders:
        encoder = label_encoders[column]
        decoded[column] = encoder.inverse_transform(decoded[column])
    return decoded
def get_domain(df, id_cols, discrete_cols):
    """Describe every non-id column by its distinct-value count and type.

    A column listed in ``discrete_cols`` is reported as ``'discrete'`` (even
    if it also appears in ``id_cols``). Any other column is reported as
    ``'continuous'`` unless it is in ``id_cols``, in which case it is left
    out.

    Args:
        df: DataFrame to describe.
        id_cols: Column names to exclude unless they are discrete.
        discrete_cols: Column names to mark as discrete.

    Returns:
        A dict mapping column name to ``{'size': <distinct count>,
        'type': 'discrete' | 'continuous'}``.
    """
    domain = {}
    for column in df.columns:
        if column in discrete_cols:
            kind = 'discrete'
        elif column in id_cols:
            continue
        else:
            kind = 'continuous'
        domain[column] = {
            'size': len(df[column].unique()),
            'type': kind,
        }
    return domain
def encode_and_save(df, discrete_cols, keys, save_dir, table_name):
    """Label-encode a table and write it, its encoders and its domain to disk.

    Three files are written into ``save_dir``:
    ``<table_name>.csv`` (encoded table, all values cast to ``str``, no
    index), ``<table_name>_label_encoders.pkl`` (pickled encoder dict) and
    ``<table_name>_domain.json`` (output of ``get_domain``).

    Args:
        df: Table to encode.
        discrete_cols: Columns to label-encode and mark as discrete.
        keys: Columns passed to ``get_domain`` as id columns.
        save_dir: Existing directory that receives the output files.
        table_name: Base name used for the three output files.
    """
    encoded, encoders = table_label_encode(df, discrete_cols)
    encoded = encoded.astype('str')

    csv_path = os.path.join(save_dir, f'{table_name}.csv')
    encoders_path = os.path.join(save_dir, f'{table_name}_label_encoders.pkl')
    domain_path = os.path.join(save_dir, f'{table_name}_domain.json')

    encoded.to_csv(csv_path, index=False)
    with open(encoders_path, 'wb') as handle:
        pickle.dump(encoders, handle)

    # The domain is computed on the stringified table, after the CSV and
    # encoders have been written.
    domain = get_domain(encoded, keys, discrete_cols)
    with open(domain_path, 'w') as handle:
        json.dump(domain, handle)
def topological_sort(graph):
    """Order a parent/child graph breadth-first from its roots.

    Args:
        graph: Mapping of node to a dict with a ``'children'`` entry listing
            that node's children. Every child must itself be a key of
            ``graph``.

    Returns:
        A list of ``[parent, child]`` pairs. Each root (node with no incoming
        edge) appears first as ``[None, root]``, in ``graph`` iteration
        order. After that, one pair is emitted for every edge leaving a
        processed node, so a child with several parents appears once per
        parent. Nodes that never reach in-degree zero (e.g. on a cycle) are
        never processed, so their outgoing edges are omitted.

    Raises:
        KeyError: If a child is not a key of ``graph``.
    """
    # Count incoming edges for every node.
    in_degree = {node: 0 for node in graph}
    for node in graph:
        for child in graph[node]['children']:
            in_degree[child] += 1

    # Roots have no incoming edges; they are reported with None as parent.
    roots = [node for node, degree in in_degree.items() if degree == 0]
    sorted_order = [[None, root] for root in roots]

    # deque gives O(1) pops from the left, unlike list.pop(0).
    queue = deque(roots)
    while queue:
        current = queue.popleft()
        for child in graph[current]['children']:
            in_degree[child] -= 1
            # A child becomes ready once all of its parents are processed.
            if in_degree[child] == 0:
                queue.append(child)
            # Record every parent-child relationship as it is visited.
            sorted_order.append([current, child])

    return sorted_order