# Author contact: haileyhalimj@gmail.com
# Commit 8504f5a: "Recover and restore preprocessing improvements from d54de4e"
# (raw / history / blame viewer residue; original file size 6.65 kB)
import pandas as pd
import datetime
from datetime import date, timedelta
import json
import os
import yaml
from pathlib import Path
# Load paths configuration.
# Paths are resolved relative to this file (parent package / "config"), so the
# module imports correctly regardless of the current working directory.
_config_dir = Path(__file__).parent.parent / "config"
_paths_file = _config_dir / "paths.yaml"
with open(_paths_file, 'r', encoding='utf-8') as f:
    # PATHS: nested dict of data-file locations, e.g. PATHS['data']['csv'][...]
    PATHS = yaml.safe_load(f)
def read_kit_line_match_data() -> pd.DataFrame:
    """Load the kit composition / relation table as a DataFrame."""
    csv_path = PATHS['data']['csv']['kit_composition']
    return pd.read_csv(csv_path)
def read_employee_data() -> pd.DataFrame:
    """Load the employee workforce hourly pay-scale table."""
    return pd.read_csv(PATHS['data']['csv']['workforce_pay_scale'])
def get_shift_info() -> pd.DataFrame:
    """Load the work-shift information table."""
    shift_path = PATHS['data']['csv']['work_shift']
    return pd.read_csv(shift_path)
def read_shift_cost_data() -> pd.DataFrame:
    """Load shift cost data (sourced from the workforce pay-scale table)."""
    pay_scale_path = PATHS['data']['csv']['workforce_pay_scale']
    return pd.read_csv(pay_scale_path)
def read_work_center_capacity() -> pd.DataFrame:
    """Load the work-center capacity table."""
    return pd.read_csv(PATHS['data']['csv']['work_center_capacity'])
def read_material_master() -> pd.DataFrame:
    """Load the material master WMS table."""
    material_path = PATHS['data']['csv']['material_master']
    return pd.read_csv(material_path)
def read_packaging_line_data() -> pd.DataFrame:
    """Load processed work-center capacity data restricted to packaging lines.

    Returns:
        pd.DataFrame: only the rows whose ``line_for_packaging`` flag is True.
    """
    path = PATHS['data']['csv']['work_center_capacity_processed']
    df = pd.read_csv(path)
    # Use the boolean column directly as a mask instead of comparing `== True`
    # (ruff E712); read_csv parses literal True/False cells to bool, so the
    # selection is identical.
    return df[df["line_for_packaging"]]
def read_orders_data(
    start_date=None,
    # end_date=None,
) -> pd.DataFrame:
    """
    Read COOIS Released Production Orders data filtered to one start date.

    Args:
        start_date: required start date (anything ``pd.to_datetime`` accepts,
            e.g. pd.Timestamp, datetime, or ISO string).

    Returns:
        pd.DataFrame: orders whose "Basic start date" equals ``start_date``.

    Raises:
        ValueError: if ``start_date`` is None, or the demand file is empty.
    """
    if start_date is None:
        # Guard clause first: fail fast before doing any file I/O.
        raise ValueError("start_date is required")
    path = PATHS['data']['csv']['demand']
    df = pd.read_csv(path)
    if df.empty:
        # Raise instead of `assert` — asserts are stripped under `python -O`,
        # which would silently let an empty file through.
        raise ValueError("No data found in the file")
    # Parse the date column, then keep only rows matching the exact start date.
    df["Basic start date"] = pd.to_datetime(df["Basic start date"])
    return df[df["Basic start date"] == pd.to_datetime(start_date)]
def read_package_speed_data():
    """Read package speed data from the Kits Calculation table.

    Returns:
        dict: kit id (str) -> kits produced per paid work hour (float).
    """
    source = PATHS['data']['csv']['kits_calculation']
    wanted = ["Kit", "Kit per day", "Paid work hours per day"]
    table = pd.read_csv(source, usecols=wanted)
    # Normalize dtypes in one pass: numeric columns to float, kit id to str.
    table = table.astype(
        {"Kit per day": float, "Paid work hours per day": float, "Kit": str}
    )
    per_hour = table["Kit per day"] / table["Paid work hours per day"]
    return dict(zip(table["Kit"], per_hour))
def read_personnel_requirement_data():
    """Read personnel requirement data from the Kits Calculation table.

    Returns:
        pd.DataFrame: columns "Kit" (str), "Humanizer" and "UNICEF staff"
        (floats); blank, NaN, or unparseable cells are coerced to 0.0.
    """
    source = PATHS['data']['csv']['kits_calculation']
    frame = pd.read_csv(source, usecols=["Kit", "Humanizer", "UNICEF staff"])

    def _to_float(raw):
        # str.strip() removes all Unicode whitespace, including the
        # non-breaking space (\xa0) that appears in these spreadsheets.
        if pd.isna(raw):
            return 0.0
        text = str(raw).strip()
        if text in ('', 'nan'):
            return 0.0
        try:
            return float(text)
        except ValueError as exc:
            print(f"Warning: Could not convert '{repr(raw)}' to float, setting to 0. Error: {exc}")
            return 0.0

    for column in ("Humanizer", "UNICEF staff"):
        frame[column] = frame[column].apply(_to_float)
    frame["Kit"] = frame["Kit"].astype(str)
    return frame
def get_production_order_data():
    """
    Extract production order information from the kit hierarchy file.

    Returns:
        tuple: (kit_levels, dependencies, priority_order)
            - kit_levels: {kit_id: level} where 0=prepack, 1=subkit, 2=master
            - dependencies: {kit_id: [dependency_list]}
            - priority_order: [kit_ids] — all prepacks first, then subkits,
              then masters, each group sorted alphabetically
    """
    hierarchy_path = PATHS['data']['hierarchy']['kit_hierarchy']
    with open(hierarchy_path, 'r', encoding='utf-8') as fh:
        hierarchy = json.load(fh)

    kit_levels = {}
    dependencies = {}

    def _register_prepack(prepack_id):
        # Prepacks are level 0 and have no dependencies; never overwrite an
        # id that was already registered (possibly at a higher level).
        if prepack_id not in kit_levels:
            kit_levels[prepack_id] = 0
            dependencies[prepack_id] = []

    for master_id, master_data in hierarchy.items():
        # Master kits sit at the top of the hierarchy (level 2).
        kit_levels[master_id] = 2
        dependencies[master_id] = master_data.get('dependencies', [])

        # Subkits (level 1), each of which may carry its own prepacks.
        for subkit_id, subkit_data in master_data.get('subkits', {}).items():
            kit_levels[subkit_id] = 1
            dependencies[subkit_id] = subkit_data.get('dependencies', [])
            for prepack_id in subkit_data.get('prepacks', []):
                _register_prepack(prepack_id)

        # Prepacks attached directly to the master (also level 0).
        for prepack_id in master_data.get('direct_prepacks', []):
            _register_prepack(prepack_id)

    # One stable sort by (level, id) reproduces "sorted prepacks, then sorted
    # subkits, then sorted masters" exactly, since levels are only 0/1/2.
    priority_order = sorted(kit_levels, key=lambda kit: (kit_levels[kit], kit))

    return kit_levels, dependencies, priority_order
if __name__ == "__main__":
    # Smoke-check the readers when run as a script.
    employees = read_employee_data()
    print("employee data")
    print(employees)
    print("line speed data", read_package_speed_data())