|
|
""" |
|
|
Kit Composition Data Cleaner |
|
|
|
|
|
This script converts the Kit_Composition_and_relation.csv file into a cleaned format |
|
|
with line types according to the following rules: |
|
|
|
|
|
1. Master Kits: |
|
|
- If appears only once (standalone master): line_type = "long line" |
|
|
- If appears multiple times: line_type = "" (empty/theoretical) |
|
|
|
|
|
2. Sub Kits: |
|
|
- All sub kits get line_type = "long line" |
|
|
|
|
|
3. Prepacks: |
|
|
- All prepacks get line_type = "miniload" |
|
|
|
|
|
The output includes columns: kit_name, kit_description, kit_type, line_type |
|
|
""" |
|
|
|
|
|
import pandas as pd |
|
|
import os |
|
|
from typing import Tuple |
|
|
|
|
|
|
|
|
def load_kit_composition_data(file_path: str) -> pd.DataFrame:
    """Load the Kit Composition and relation CSV file.

    Args:
        file_path: Path to the input CSV.

    Returns:
        The raw file contents as a DataFrame.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
    """
    # Fail fast with a clear message instead of letting pandas raise later.
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"File not found: {file_path}")

    frame = pd.read_csv(file_path)
    print(f"Loaded {len(frame)} rows from {file_path}")
    return frame
|
|
|
|
|
|
|
|
def process_master_kits(df: pd.DataFrame) -> pd.DataFrame:
    """
    Process Master Kits according to business rules:
    - Standalone masters (no subkits/prepacks, only components): line_type = "long line"
    - Non-standalone masters (have subkits/prepacks): line_type = "" (empty - no production needed)

    Args:
        df: Raw kit composition data with columns 'Master Kit',
            'Master Kit Description', 'Sub kit' and 'Prepack'.

    Returns:
        One record per unique master kit with columns
        kit_name, kit_description, kit_type, line_type.
    """
    print("Processing Master Kits...")

    # Masters that own at least one sub kit or prepack row.
    masters_with_subkits = set(df.loc[df['Sub kit'].notna(), 'Master Kit'].unique())
    masters_with_prepacks = set(df.loc[df['Prepack'].notna(), 'Master Kit'].unique())
    masters_with_hierarchy = masters_with_subkits | masters_with_prepacks

    # Exclude NaN master names: rows with a missing 'Master Kit' would
    # otherwise produce a bogus NaN master record.
    all_masters = set(df.loc[df['Master Kit'].notna(), 'Master Kit'].unique())

    # Standalone = appears in the data but never with a subkit/prepack.
    standalone_masters = all_masters - masters_with_hierarchy

    print(f"Total unique Master Kits: {len(all_masters)}")
    print(f"Masters with subkits/prepacks: {len(masters_with_hierarchy)}")
    print(f"Standalone masters (only components): {len(standalone_masters)}")

    # Deduplicate on the kit key only: deduplicating on (name, description)
    # would emit the same master twice if its description varies across rows.
    # keep='first' retains the first description seen.
    unique_masters = (
        df.loc[df['Master Kit'].notna(), ['Master Kit', 'Master Kit Description']]
        .drop_duplicates(subset=['Master Kit'])
    )

    master_data = [
        {
            'kit_name': row['Master Kit'],
            'kit_description': row['Master Kit Description'],
            'kit_type': 'master',
            # Only standalone masters are actually produced on a line.
            'line_type': "long line" if row['Master Kit'] in standalone_masters else "",
        }
        for _, row in unique_masters.iterrows()
    ]

    master_df = pd.DataFrame(master_data)
    print(f"Created {len(master_df)} master kit records")
    print(f"Standalone masters with 'long line': {sum(master_df['line_type'] == 'long line')}")

    return master_df
|
|
|
|
|
|
|
|
def process_sub_kits(df: pd.DataFrame) -> pd.DataFrame:
    """
    Process Sub Kits according to business rules:
    - All sub kits get line_type = "long line"
    - Remove duplicates

    Args:
        df: Raw kit composition data with columns 'Sub kit' and
            'Sub kit description'.

    Returns:
        One record per unique sub kit with columns
        kit_name, kit_description, kit_type, line_type.
    """
    print("Processing Sub Kits...")

    # Only rows that actually reference a sub kit.
    subkit_df = df[df['Sub kit'].notna()].copy()

    if len(subkit_df) == 0:
        print("No sub kits found")
        # Keep the output schema stable even when empty.
        return pd.DataFrame(columns=['kit_name', 'kit_description', 'kit_type', 'line_type'])

    # Deduplicate on the kit key only: deduplicating on (name, description)
    # would emit the same sub kit twice if its description varies across rows.
    unique_subkits = subkit_df[['Sub kit', 'Sub kit description']].drop_duplicates(
        subset=['Sub kit']
    )

    subkit_data = [
        {
            'kit_name': row['Sub kit'],
            'kit_description': row['Sub kit description'],
            'kit_type': 'subkit',
            # Business rule: every sub kit is produced on the long line.
            'line_type': 'long line',
        }
        for _, row in unique_subkits.iterrows()
    ]

    subkit_result = pd.DataFrame(subkit_data)
    print(f"Created {len(subkit_result)} sub kit records")

    return subkit_result
|
|
|
|
|
|
|
|
def process_prepacks(df: pd.DataFrame) -> pd.DataFrame:
    """
    Process Prepacks according to business rules:
    - All prepacks get line_type = "miniload"
    - Remove duplicates

    Args:
        df: Raw kit composition data with columns 'Prepack' and
            'Prepack Description'.

    Returns:
        One record per unique prepack with columns
        kit_name, kit_description, kit_type, line_type.
    """
    print("Processing Prepacks...")

    # Only rows that actually reference a prepack.
    prepack_df = df[df['Prepack'].notna()].copy()

    if len(prepack_df) == 0:
        print("No prepacks found")
        # Keep the output schema stable even when empty.
        return pd.DataFrame(columns=['kit_name', 'kit_description', 'kit_type', 'line_type'])

    # Deduplicate on the kit key only: deduplicating on (name, description)
    # would emit the same prepack twice if its description varies across rows.
    unique_prepacks = prepack_df[['Prepack', 'Prepack Description']].drop_duplicates(
        subset=['Prepack']
    )

    prepack_data = [
        {
            'kit_name': row['Prepack'],
            'kit_description': row['Prepack Description'],
            'kit_type': 'prepack',
            # Business rule: every prepack is handled by the miniload.
            'line_type': 'miniload',
        }
        for _, row in unique_prepacks.iterrows()
    ]

    prepack_result = pd.DataFrame(prepack_data)
    print(f"Created {len(prepack_result)} prepack records")

    return prepack_result
|
|
|
|
|
|
|
|
def concatenate_and_save(master_df: pd.DataFrame, subkit_df: pd.DataFrame,
                         prepack_df: pd.DataFrame, output_path: str) -> pd.DataFrame:
    """
    Concatenate all processed dataframes and save to output file.

    Args:
        master_df: Processed master kit records.
        subkit_df: Processed sub kit records.
        prepack_df: Processed prepack records.
        output_path: Destination CSV path.

    Returns:
        The combined, sorted DataFrame that was written to disk.
    """
    print("Concatenating results...")

    # Stack the three record sets into one table with a fresh index.
    combined = pd.concat([master_df, subkit_df, prepack_df], ignore_index=True)

    # Normalize missing line types to the empty string for a clean CSV.
    combined['line_type'] = combined['line_type'].fillna('')

    # Deterministic ordering: group by kit type, then alphabetize kit names.
    combined = combined.sort_values(['kit_type', 'kit_name']).reset_index(drop=True)

    print(f"Final dataset contains {len(combined)} records:")
    print(f"  - Masters: {len(master_df)}")
    print(f"  - Subkits: {len(subkit_df)}")
    print(f"  - Prepacks: {len(prepack_df)}")

    combined.to_csv(output_path, index=False, na_rep='')
    print(f"Saved cleaned data to: {output_path}")

    return combined
|
|
|
|
|
|
|
|
def main():
    """Main function to execute the kit composition cleaning process.

    Reads the raw composition file from ``base_dir``, derives one record per
    master kit / sub kit / prepack with its line type, and writes the cleaned
    CSV back to ``base_dir``.
    """
    base_dir = "/Users/halimjun/Coding_local/SD_roster_real"
    # BUGFIX: previously the code joined base_dir with full "s3://..." URIs,
    # producing invalid paths like "<base_dir>/s3:/bucket/file.csv".
    # os.path.join must be given plain filenames relative to base_dir.
    input_file = os.path.join(base_dir, "Kit_Composition_and_relation.csv")
    output_file = os.path.join(
        base_dir, "Kit_Composition_and_relation_cleaned_with_line_type.csv"
    )

    try:
        df = load_kit_composition_data(input_file)

        # Derive one record set per kit category.
        master_df = process_master_kits(df)
        subkit_df = process_sub_kits(df)
        prepack_df = process_prepacks(df)

        final_df = concatenate_and_save(master_df, subkit_df, prepack_df, output_file)

        print("\n=== SUMMARY ===")
        print("Line type distribution:")
        print(final_df['line_type'].value_counts(dropna=False))
        print("\nKit type distribution:")
        print(final_df['kit_type'].value_counts())

        print("\nSample of final data:")
        print(final_df.head(10))

    except Exception as e:
        # Surface the failure for logging, then re-raise so callers see it.
        print(f"Error processing kit composition data: {e}")
        raise


if __name__ == "__main__":
    main()
|
|
|