Spaces:
Running
Running
File size: 3,533 Bytes
c4ac745 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 | import pandas as pd
import numpy as np
from datetime import timedelta
from datetime import datetime
from sklearn.preprocessing import LabelEncoder
import pickle
import json
import os
def calculate_days_since_earliest_date(dates):
    """Express 'YYMMDD' date strings as day offsets from the earliest one.

    Args:
        dates: Iterable of date strings in ``'%y%m%d'`` format.

    Returns:
        A tuple ``(days_since, earliest)``. ``days_since`` lists, in input
        order, the whole number of days between each date and the earliest
        date; ``earliest`` is that earliest date formatted as ``'%y%m%d'``.

    Raises:
        ValueError: If ``dates`` is empty or a string does not match the
            expected format.
    """
    fmt = '%y%m%d'
    parsed = []
    for text in dates:
        parsed.append(datetime.strptime(text, fmt))

    # Every offset is measured against the minimum, so offsets are >= 0.
    reference = min(parsed)
    offsets = [(moment - reference).days for moment in parsed]
    return offsets, reference.strftime(fmt)
def reconstruct_dates(days_since, earliest_date_str):
    """Turn day offsets back into 'YYMMDD' date strings.

    Args:
        days_since: Iterable of day offsets relative to the earliest date.
        earliest_date_str: The reference date in ``'%y%m%d'`` format.

    Returns:
        A list of ``'%y%m%d'`` strings, one per offset, in input order.

    Raises:
        ValueError: If ``earliest_date_str`` does not match the format.
    """
    fmt = '%y%m%d'
    base = datetime.strptime(earliest_date_str, fmt)
    rebuilt = []
    for offset in days_since:
        shifted = base + timedelta(days=offset)
        rebuilt.append(shifted.strftime(fmt))
    return rebuilt
def birth_number_split(birth_numbers):
    """Split birth-number strings into year, month, day and gender lists.

    Each birth number is sliced as ``[0:2]`` year, ``[2:4]`` month and
    ``[4:6]`` day. A month field of 50 or more is reduced by 50 and the
    record is flagged with gender ``1``; otherwise the gender is ``0``.
    NOTE(review): this looks like the Czech birth-number convention where
    the month is offset by 50 for women -- confirm against the data source.

    The input is traversed exactly once, so one-shot iterables (generators,
    iterators) are handled correctly.

    Args:
        birth_numbers: Iterable of sliceable strings with at least six
            leading digits.

    Returns:
        A tuple ``(years, months, days, genders)`` of equally long lists of
        ints, in input order.

    Raises:
        ValueError: If any of the sliced fields is not a valid integer.
    """
    years, months, days, genders = [], [], [], []
    for bn in birth_numbers:
        year = int(bn[:2])
        month = int(bn[2:4])
        day = int(bn[4:6])

        # The gender flag is encoded as a +50 offset on the month field.
        offset_month = month >= 50
        if offset_month:
            month -= 50

        years.append(year)
        months.append(month)
        days.append(day)
        genders.append(1 if offset_month else 0)
    return years, months, days, genders
def table_label_encode(df, discrete_cols):
    """Label-encode the given columns on a copy of the table.

    Args:
        df: Input table; it is copied, not modified.
        discrete_cols: Iterable of column names to encode. Each column gets
            its own freshly fitted ``LabelEncoder``.

    Returns:
        A tuple ``(encoded_df, label_encoders)`` where ``label_encoders``
        maps each encoded column name to the encoder fitted on it.
    """
    encoded = df.copy()
    fitted = {}
    for name in discrete_cols:
        encoder = LabelEncoder()
        codes = encoder.fit_transform(encoded[name])
        encoded[name] = codes
        # Keep the fitted encoder so the column can be decoded later.
        fitted[name] = encoder
    return encoded, fitted
def table_label_decode(df, label_encoders):
    """Undo label encoding on a copy of the table.

    Args:
        df: Table whose encoded columns should be decoded; it is copied,
            not modified.
        label_encoders: Mapping of column name to a fitted encoder exposing
            ``inverse_transform``.

    Returns:
        A copy of ``df`` in which every column named in ``label_encoders``
        has been replaced by its ``inverse_transform`` output.
    """
    decoded = df.copy()
    for name in label_encoders:
        encoder = label_encoders[name]
        decoded[name] = encoder.inverse_transform(decoded[name])
    return decoded
def get_domain(df, id_cols, discrete_cols):
    """Describe each non-id column by its number of unique values and type.

    Args:
        df: Table to describe.
        id_cols: Column names to leave out of the result, unless they are
            also listed in ``discrete_cols`` (that check comes first).
        discrete_cols: Column names to mark as ``'discrete'``.

    Returns:
        A dict, in column order, mapping column name to
        ``{'size': <unique value count>, 'type': 'discrete' | 'continuous'}``.
    """
    domain = {}
    for col in df.columns:
        # Discrete membership is tested before the id exclusion.
        if col in discrete_cols:
            kind = 'discrete'
        elif col in id_cols:
            continue
        else:
            kind = 'continuous'
        distinct = df[col].unique()
        domain[col] = {'size': len(distinct), 'type': kind}
    return domain
def encode_and_save(df, discrete_cols, keys, save_dir, table_name):
    """Label-encode a table and write it, its encoders and its domain to disk.

    Three files are written into ``save_dir``:

    * ``{table_name}.csv`` -- the encoded table, every column cast to str,
      without the index.
    * ``{table_name}_label_encoders.pkl`` -- pickled dict of the fitted
      label encoders. Pickle files must only be loaded from trusted
      sources.
    * ``{table_name}_domain.json`` -- the per-column domain description
      computed by ``get_domain`` on the encoded table.

    Args:
        df: Table to encode; it is not modified.
        discrete_cols: Column names to label-encode and mark as discrete.
        keys: Column names passed to ``get_domain`` as id columns.
        save_dir: Output directory. It is created (including parents) when
            it does not exist; an empty string means the current directory.
        table_name: Base name used for the three output files.

    Returns:
        None.
    """
    df_encoded, df_label_encoders = table_label_encode(df, discrete_cols)
    df_encoded = df_encoded.astype('str')

    # Create the output directory up front so the writes below cannot fail
    # on a missing path. os.makedirs('') raises, so an empty save_dir
    # (current directory) is skipped.
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)

    df_encoded.to_csv(os.path.join(save_dir, f'{table_name}.csv'), index=False)

    with open(os.path.join(save_dir, f'{table_name}_label_encoders.pkl'), 'wb') as f:
        pickle.dump(df_label_encoders, f)

    df_domain = get_domain(df_encoded, keys, discrete_cols)
    with open(os.path.join(save_dir, f'{table_name}_domain.json'), 'w') as f:
        json.dump(df_domain, f)
def topological_sort(graph):
    """Return the graph's edges in breadth-first topological order.

    Args:
        graph: Mapping of node to a dict with a ``'children'`` iterable.
            Every child must itself be a key of ``graph``.

    Returns:
        A list of ``[parent, child]`` pairs. Root nodes (no incoming edges)
        come first as ``[None, root]`` in mapping order, followed by one
        pair per edge, emitted when the parent is dequeued. Nodes on a
        cycle never reach in-degree zero, so they are never dequeued and
        their outgoing edges are omitted.

    Raises:
        KeyError: If a child is not a key of ``graph`` or a node entry has
            no ``'children'`` key.
    """
    # Local import keeps this block self-contained; deque gives O(1)
    # pops from the left where list.pop(0) is O(n).
    from collections import deque

    # Count incoming edges for every node.
    in_degree = {node: 0 for node in graph}
    for node in graph:
        for child in graph[node]['children']:
            in_degree[child] += 1

    # Roots have no incoming edges; record them with None as the parent.
    roots = [node for node, degree in in_degree.items() if degree == 0]
    sorted_order = [[None, root] for root in roots]

    queue = deque(roots)
    while queue:
        current = queue.popleft()
        for child in graph[current]['children']:
            in_degree[child] -= 1
            # A child is ready once all of its parents have been processed.
            if in_degree[child] == 0:
                queue.append(child)
            # Every edge is recorded, whether or not the child became ready.
            sorted_order.append([current, child])
    return sorted_order
|