import json
import os
import pickle
from collections import deque
from datetime import datetime
from datetime import timedelta

import numpy as np
import pandas as pd

try:
    from sklearn.preprocessing import LabelEncoder
except ImportError:
    # scikit-learn is only needed by table_label_encode; keeping the name
    # bound lets the pure helpers below be imported without it.
    LabelEncoder = None


def calculate_days_since_earliest_date(dates):
    """Convert 'yymmdd' date strings to day offsets from the earliest one.

    Args:
        dates: Iterable of date strings in '%y%m%d' format.

    Returns:
        A tuple ``(days_since, earliest)`` where ``days_since`` is a list of
        integer day offsets (same order as the input) and ``earliest`` is the
        earliest date, formatted as '%y%m%d'.

    Raises:
        ValueError: If ``dates`` is empty or a string does not match the
            expected format.
    """
    # NOTE: '%y' is a two-digit year; how it maps to a century is decided by
    # datetime.strptime, not by this function.
    parsed = [datetime.strptime(date, '%y%m%d') for date in dates]
    earliest_date = min(parsed)
    days_since = [(moment - earliest_date).days for moment in parsed]
    return days_since, earliest_date.strftime('%y%m%d')


def reconstruct_dates(days_since, earliest_date_str):
    """Invert calculate_days_since_earliest_date.

    Args:
        days_since: Iterable of day offsets relative to ``earliest_date_str``.
        earliest_date_str: Reference date in '%y%m%d' format.

    Returns:
        List of '%y%m%d' strings, one per offset, in input order.
    """
    earliest_date = datetime.strptime(earliest_date_str, '%y%m%d')
    return [
        (earliest_date + timedelta(days=offset)).strftime('%y%m%d')
        for offset in days_since
    ]


def birth_number_split(birth_numbers):
    """Split 'yymmdd'-style birth numbers into year, month, day and a flag.

    Characters 0-1 are read as the year, 2-3 as the month and 4-5 as the day.
    A month value of 50 or more has 50 subtracted and sets the flag to 1;
    otherwise the flag is 0.
    # NOTE(review): the flag is called "gender" in this code; which gender
    # the value 1 stands for is not visible here -- confirm against the data.

    Args:
        birth_numbers: Iterable of strings with at least six digits.

    Returns:
        Four parallel lists ``(years, months, days, genders)`` of ints.

    Raises:
        ValueError: If any of the three fields is not an integer literal.
    """
    years, months, days, genders = [], [], [], []
    for bn in birth_numbers:
        month = int(bn[2:4])
        shifted = month >= 50
        years.append(int(bn[:2]))
        months.append(month - 50 if shifted else month)
        days.append(int(bn[4:6]))
        genders.append(1 if shifted else 0)
    return years, months, days, genders


def table_label_encode(df, discrete_cols):
    """Label-encode the given columns of a copy of ``df``.

    Args:
        df: Input DataFrame; it is not modified.
        discrete_cols: Column names to encode, each with its own LabelEncoder.

    Returns:
        A tuple ``(encoded_df, label_encoders)`` where ``label_encoders`` maps
        each encoded column name to its fitted LabelEncoder.

    Raises:
        ImportError: If scikit-learn is not installed.
    """
    if LabelEncoder is None:
        raise ImportError(
            'table_label_encode requires scikit-learn '
            '(sklearn.preprocessing.LabelEncoder)'
        )
    df = df.copy()
    label_encoders = {}
    for col in discrete_cols:
        encoder = LabelEncoder()
        df[col] = encoder.fit_transform(df[col])
        label_encoders[col] = encoder
    return df, label_encoders


def table_label_decode(df, label_encoders):
    """Undo table_label_encode on a copy of ``df``.

    Args:
        df: DataFrame whose encoded columns hold integer labels.
        label_encoders: Mapping of column name to fitted encoder, as returned
            by table_label_encode.

    Returns:
        A new DataFrame with every column in ``label_encoders`` replaced by
        its inverse-transformed values.
    """
    df = df.copy()
    for col, encoder in label_encoders.items():
        df[col] = encoder.inverse_transform(df[col])
    return df


def get_domain(df, id_cols, discrete_cols):
    """Describe each non-id column by its distinct-value count and type.

    Columns listed in ``discrete_cols`` are reported as 'discrete' (even if
    they also appear in ``id_cols``); every other column that is not in
    ``id_cols`` is reported as 'continuous'. Remaining id columns are omitted.

    Args:
        df: DataFrame to describe.
        id_cols: Column names treated as identifiers.
        discrete_cols: Column names treated as discrete.

    Returns:
        Dict mapping column name to ``{'size': <distinct count>, 'type': ...}``.
    """
    domain = {}
    for col in df.columns:
        if col in discrete_cols:
            kind = 'discrete'
        elif col not in id_cols:
            kind = 'continuous'
        else:
            continue
        domain[col] = {
            'size': len(df[col].unique()),
            'type': kind,
        }
    return domain


def encode_and_save(df, discrete_cols, keys, save_dir, table_name):
    """Label-encode a table and write it, its encoders and its domain to disk.

    Three files are written under ``save_dir``:
    ``{table_name}.csv`` (the encoded table, all values cast to str),
    ``{table_name}_label_encoders.pkl`` (pickled dict of fitted encoders) and
    ``{table_name}_domain.json`` (output of get_domain on the encoded table,
    with ``keys`` passed as the id columns).

    Args:
        df: Table to encode.
        discrete_cols: Columns to label-encode.
        keys: Id columns excluded from the domain unless also discrete.
        save_dir: Existing directory to write into.
        table_name: Base name for the output files.
    """
    df_encoded, df_label_encoders = table_label_encode(df, discrete_cols)
    # The domain below is computed on the stringified frame, as before.
    df_encoded = df_encoded.astype('str')
    df_encoded.to_csv(os.path.join(save_dir, f'{table_name}.csv'), index=False)

    encoders_path = os.path.join(save_dir, f'{table_name}_label_encoders.pkl')
    with open(encoders_path, 'wb') as f:
        pickle.dump(df_label_encoders, f)

    df_domain = get_domain(df_encoded, keys, discrete_cols)
    domain_path = os.path.join(save_dir, f'{table_name}_domain.json')
    with open(domain_path, 'w') as f:
        json.dump(df_domain, f)


def topological_sort(graph):
    """Return the graph's edges in breadth-first topological order.

    Args:
        graph: Dict mapping each node to a dict with a 'children' list. Every
            child must itself be a key of ``graph``.

    Returns:
        List of ``[parent, child]`` pairs. Each root (node with no incoming
        edge) appears first as ``[None, root]``; then every edge is listed at
        the moment its parent is processed. A node that never reaches
        in-degree zero (for example one on a cycle) is never processed, so
        its outgoing edges are not emitted and no error is raised.

    Raises:
        KeyError: If a child is not a key of ``graph``.
    """
    # Count incoming edges for every node.
    in_degree = {node: 0 for node in graph}
    for node in graph:
        for child in graph[node]['children']:
            in_degree[child] += 1

    roots = [node for node, degree in in_degree.items() if degree == 0]

    # Roots are recorded with None as their parent.
    sorted_order = [[None, root] for root in roots]

    # deque gives O(1) pops from the left; list.pop(0) is O(n).
    queue = deque(roots)
    while queue:
        current = queue.popleft()
        for child in graph[current]['children']:
            in_degree[child] -= 1
            if in_degree[child] == 0:
                queue.append(child)
            # Add each parent-child relationship as we process them
            sorted_order.append([current, child])

    return sorted_order