haileyhalimj@gmail.com
Rename src/etl to src/preprocess for better code organization
1131bea
raw
history blame
7.37 kB
import pandas as pd
import datetime
from datetime import date, timedelta
import json
import os
# Default dates - will be overridden by optimization_config.py
# Fallback planning window used when callers pass no explicit dates
# (see read_demand_data); mutated in place by set_global_dates().
START_DATE = pd.Timestamp(2025, 7, 7)
END_DATE = pd.Timestamp(2025, 7, 11)
def set_global_dates(start_date, end_date):
    """Overwrite the module-level START_DATE / END_DATE planning window.

    Args:
        start_date: window start; anything ``pd.Timestamp`` accepts.
        end_date: window end; same accepted types.
    """
    global START_DATE, END_DATE
    START_DATE, END_DATE = pd.Timestamp(start_date), pd.Timestamp(end_date)
    print(f"Updated global dates: {START_DATE} to {END_DATE}")
def read_excel(path: str) -> pd.DataFrame:
    """Load an Excel workbook, forcing the "id" column to nullable Int64."""
    workbook = pd.read_excel(path, dtype={"id": "Int64"})
    return workbook
def read_demand_data(
    path="data/real_data_excel/converted_csv/COOIS_Planned_and_Released.csv",
    start_date=None,
    end_date=None,
) -> pd.DataFrame:
    """Load demand rows whose "Basic start date" falls inside the window.

    Args:
        path: CSV file with a "Basic start date" column.
        start_date: window start (anything pd.Timestamp accepts);
            falls back to module-level START_DATE when None.
        end_date: inclusive window end; falls back to END_DATE when None.

    Returns:
        pd.DataFrame: rows with start_date <= "Basic start date" <= end_date.
    """
    df = pd.read_csv(path)
    df["Basic start date"] = pd.to_datetime(df["Basic start date"])
    # Use provided dates or fall back to module defaults.
    filter_start_date = pd.Timestamp(start_date) if start_date is not None else START_DATE
    filter_end_date = pd.Timestamp(end_date) if end_date is not None else END_DATE
    # BUG FIX: the original compared only for equality with the start date,
    # so end_date (and the END_DATE fallback) was computed but silently ignored.
    in_window = (df["Basic start date"] >= filter_start_date) & (
        df["Basic start date"] <= filter_end_date
    )
    return df[in_window]
def read_kit_line_match_data(
    path="data/real_data_excel/converted_csv/Kit_Composition_and_relation_cleaned_with_line_type.csv",
) -> pd.DataFrame:
    """Load the kit-to-line composition/matching table."""
    table = pd.read_csv(path)
    return table
def read_employee_data(
    path="data/real_data_excel/converted_csv/WH_Workforce_Hourly_Pay_Scale_processed.csv",
) -> pd.DataFrame:
    """Load the processed warehouse workforce / hourly pay-scale table."""
    workforce = pd.read_csv(path)
    return workforce
def get_shift_info(
    path = "data/real_data_excel/converted_csv/work_shift.csv"
) -> pd.DataFrame:
    """Load the work-shift definition table."""
    return pd.read_csv(path)
def read_shift_cost_data(
    path="data/real_data_excel/converted_csv/WH_Workforce_Hourly_Pay_Scale_processed.csv",
) -> pd.DataFrame:
    """Load shift cost data (same processed pay-scale file as read_employee_data)."""
    costs = pd.read_csv(path)
    return costs
def read_work_center_capacity(
    path="data/real_data_excel/converted_csv/Work_Centre_Capacity.csv",
) -> pd.DataFrame:
    """Load the raw work-centre capacity table."""
    capacity = pd.read_csv(path)
    return capacity
def read_material_master(
    path="data/real_data_excel/converted_csv/Material_Master_WMS.csv",
) -> pd.DataFrame:
    """Load the WMS material master table."""
    materials = pd.read_csv(path)
    return materials
def read_packaging_line_data(
    path="data/real_data_excel/converted_csv/Work_Centre_Capacity_processed.csv",
) -> pd.DataFrame:
    """Load processed work-centre capacity rows and keep packaging lines only.

    Args:
        path: CSV with a boolean "line_for_packaging" column.

    Returns:
        pd.DataFrame: rows where "line_for_packaging" is True.
    """
    df = pd.read_csv(path)
    # Idiomatic boolean indexing instead of `== True` (PEP 8 / ruff E712).
    # NOTE(review): assumes read_csv parsed the column as bool dtype, which it
    # does for literal True/False cells — confirm the processed file keeps that.
    return df[df["line_for_packaging"]]
def read_orders_data(
    path="data/real_data_excel/converted_csv/COOIS_Planned_and_Released.csv",
    start_date=None,
    # end_date=None,
) -> pd.DataFrame:
    """
    Load production orders (COOIS export) scheduled on exactly start_date.

    Args:
        path: path to the csv file
        start_date: required start date (pd.Timestamp, datetime, or parseable str)

    Returns:
        pd.DataFrame: rows whose "Basic start date" equals start_date

    Raises:
        ValueError: if start_date is None, or the file contains no rows.
    """
    # Validate the required argument before touching the file.
    if start_date is None:
        raise ValueError("start_date is required")
    df = pd.read_csv(path)
    if df.empty:
        # BUG FIX: was `assert len(df) > 0`, which is stripped under
        # `python -O`; raise an explicit error instead.
        raise ValueError("No data found in the file")
    # convert date column to datetime
    df["Basic start date"] = pd.to_datetime(df["Basic start date"])
    # Filter for the exact start date only.
    return df[df["Basic start date"] == pd.to_datetime(start_date)]
def read_package_speed_data(
    path="data/real_data_excel/converted_csv/Kits__Calculation.csv",
):
    """Return {kit_id: kits per paid work hour}, computed as daily kit
    output divided by paid work hours per day."""
    rates = pd.read_csv(path, usecols=["Kit", "Kit per day","Paid work hours per day"])
    rates["Kit per day"] = rates["Kit per day"].astype(float)
    rates["Paid work hours per day"] = rates["Paid work hours per day"].astype(float)
    rates["Kit"] = rates["Kit"].astype(str)
    hourly = rates["Kit per day"] / rates["Paid work hours per day"]
    return dict(zip(rates["Kit"], hourly))
def read_personnel_requirement_data(
    path="data/real_data_excel/converted_csv/Kits__Calculation.csv",
):
    """Load per-kit staffing requirements ("Humanizer" and "UNICEF staff"),
    coercing messy cells (NaN, blanks, non-breaking-space padding) to 0.0."""
    df = pd.read_csv(path, usecols=["Kit", "Humanizer", "UNICEF staff"])

    def _to_float(raw):
        # Missing cells count as zero staff.
        if pd.isna(raw):
            return 0.0
        # str.strip() removes \xa0 (non-breaking space) along with normal whitespace.
        text = str(raw).strip()
        if text == '' or text == 'nan':
            return 0.0
        try:
            return float(text)
        except ValueError as e:
            print(f"Warning: Could not convert '{repr(raw)}' to float, setting to 0. Error: {e}")
            return 0.0

    for column in ("Humanizer", "UNICEF staff"):
        df[column] = df[column].apply(_to_float)
    df["Kit"] = df["Kit"].astype(str)
    return df
def get_production_order_data(path="data/hierarchy_exports/kit_hierarchy.json"):
    """
    Extract production order information from the kit hierarchy JSON.

    Args:
        path: hierarchy JSON file. Defaults to the project export location
            (GENERALIZED: the path was previously hard-coded; the default
            preserves the old behavior).

    Returns:
        - kit_levels: {kit_id: level} where level 0=prepack, 1=subkit, 2=master
        - dependencies: {kit_id: [dependency_list]}
        - priority_order: [kit_ids] sorted by production priority
    """
    with open(path, 'r', encoding='utf-8') as f:
        hierarchy = json.load(f)
    kit_levels = {}
    dependencies = {}
    # Walk the hierarchy: masters (level 2) -> subkits (level 1) -> prepacks (level 0).
    for master_id, master_data in hierarchy.items():
        kit_levels[master_id] = 2
        dependencies[master_id] = master_data.get('dependencies', [])
        for subkit_id, subkit_data in master_data.get('subkits', {}).items():
            kit_levels[subkit_id] = 1
            dependencies[subkit_id] = subkit_data.get('dependencies', [])
            for prepack_id in subkit_data.get('prepacks', []):
                if prepack_id not in kit_levels:  # first sighting wins
                    kit_levels[prepack_id] = 0
                    dependencies[prepack_id] = []
        # Prepacks attached directly to the master are still level 0.
        for prepack_id in master_data.get('direct_prepacks', []):
            if prepack_id not in kit_levels:  # first sighting wins
                kit_levels[prepack_id] = 0
                dependencies[prepack_id] = []
    # Priority: prepacks first, then subkits, then masters;
    # alphabetical within each level.
    priority_order = []
    for level in (0, 1, 2):
        priority_order.extend(sorted(k for k, lv in kit_levels.items() if lv == level))
    return kit_levels, dependencies, priority_order
if __name__ == "__main__":
    # Smoke-check the loaders against the real project data files.
    staff_table = read_employee_data()
    print("employee data")
    print(staff_table)
    print("line speed data", read_package_speed_data())