""" Kit Composition Data Cleaner This script converts the Kit_Composition_and_relation.csv file into a cleaned format with line types according to the following rules: 1. Master Kits: - If appears only once (standalone master): line_type = "long line" - If appears multiple times: line_type = "" (empty/theoretical) 2. Sub Kits: - All sub kits get line_type = "long line" 3. Prepacks: - All prepacks get line_type = "miniload" The output includes columns: kit_name, kit_description, kit_type, line_type """ import pandas as pd import os from typing import Tuple def load_kit_composition_data(file_path: str) -> pd.DataFrame: """Load the Kit Composition and relation CSV file.""" if not os.path.exists(file_path): raise FileNotFoundError(f"File not found: {file_path}") df = pd.read_csv(file_path) print(f"Loaded {len(df)} rows from {file_path}") return df def process_master_kits(df: pd.DataFrame) -> pd.DataFrame: """ Process Master Kits according to business rules: - Standalone masters (no subkits/prepacks, only components): line_type = "long line" - Non-standalone masters (have subkits/prepacks): line_type = "" (empty - no production needed) """ print("Processing Master Kits...") # Identify masters with hierarchy (subkits or prepacks) masters_with_subkits = set(df[df['Sub kit'].notna()]['Master Kit'].unique()) masters_with_prepacks = set(df[df['Prepack'].notna()]['Master Kit'].unique()) masters_with_hierarchy = masters_with_subkits.union(masters_with_prepacks) # All masters all_masters = set(df['Master Kit'].unique()) # Standalone masters are those WITHOUT subkits/prepacks (only have components) standalone_masters = all_masters - masters_with_hierarchy print(f"Total unique Master Kits: {len(all_masters)}") print(f"Masters with subkits/prepacks: {len(masters_with_hierarchy)}") print(f"Standalone masters (only components): {len(standalone_masters)}") # Create master kit records master_data = [] # Get unique master kits with descriptions unique_masters = df[['Master Kit', 'Master Kit Description']].drop_duplicates() for _, row in unique_masters.iterrows(): master_kit = row['Master Kit'] master_desc = row['Master Kit Description'] # Determine line_type based on standalone status if master_kit in standalone_masters: line_type = "long line" else: line_type = "" # Empty for non-standalone (theoretical) master_data.append({ 'kit_name': master_kit, 'kit_description': master_desc, 'kit_type': 'master', 'line_type': line_type }) master_df = pd.DataFrame(master_data) print(f"Created {len(master_df)} master kit records") print(f"Standalone masters with 'long line': {sum(master_df['line_type'] == 'long line')}") return master_df def process_sub_kits(df: pd.DataFrame) -> pd.DataFrame: """ Process Sub Kits according to business rules: - All sub kits get line_type = "long line" - Remove duplicates """ print("Processing Sub Kits...") # Filter rows that have sub kits subkit_df = df[df['Sub kit'].notna()].copy() if len(subkit_df) == 0: print("No sub kits found") return pd.DataFrame(columns=['kit_name', 'kit_description', 'kit_type', 'line_type']) # Get unique sub kits with descriptions unique_subkits = subkit_df[['Sub kit', 'Sub kit description']].drop_duplicates() subkit_data = [] for _, row in unique_subkits.iterrows(): subkit_data.append({ 'kit_name': row['Sub kit'], 'kit_description': row['Sub kit description'], 'kit_type': 'subkit', 'line_type': 'long line' }) subkit_result = pd.DataFrame(subkit_data) print(f"Created {len(subkit_result)} sub kit records") return subkit_result def process_prepacks(df: pd.DataFrame) -> pd.DataFrame: """ Process Prepacks according to business rules: - All prepacks get line_type = "miniload" - Remove duplicates """ print("Processing Prepacks...") # Filter rows that have prepacks prepack_df = df[df['Prepack'].notna()].copy() if len(prepack_df) == 0: print("No prepacks found") return pd.DataFrame(columns=['kit_name', 'kit_description', 'kit_type', 'line_type']) # Get unique prepacks with descriptions unique_prepacks = prepack_df[['Prepack', 'Prepack Description']].drop_duplicates() prepack_data = [] for _, row in unique_prepacks.iterrows(): prepack_data.append({ 'kit_name': row['Prepack'], 'kit_description': row['Prepack Description'], 'kit_type': 'prepack', 'line_type': 'miniload' }) prepack_result = pd.DataFrame(prepack_data) print(f"Created {len(prepack_result)} prepack records") return prepack_result def concatenate_and_save(master_df: pd.DataFrame, subkit_df: pd.DataFrame, prepack_df: pd.DataFrame, output_path: str) -> pd.DataFrame: """ Concatenate all processed dataframes and save to output file. """ print("Concatenating results...") # Concatenate all dataframes final_df = pd.concat([master_df, subkit_df, prepack_df], ignore_index=True) # Ensure empty strings instead of NaN for line_type final_df['line_type'] = final_df['line_type'].fillna('') # Sort by kit_type for better organization final_df = final_df.sort_values(['kit_type', 'kit_name']).reset_index(drop=True) print(f"Final dataset contains {len(final_df)} records:") print(f" - Masters: {len(master_df)}") print(f" - Subkits: {len(subkit_df)}") print(f" - Prepacks: {len(prepack_df)}") # Save to file (keep empty strings as empty, not NaN) final_df.to_csv(output_path, index=False, na_rep='') print(f"Saved cleaned data to: {output_path}") return final_df def main(): """Main function to execute the kit composition cleaning process.""" # Define file paths base_dir = "/Users/halimjun/Coding_local/SD_roster_real" input_file = os.path.join(base_dir, "s3://supply-roster-optimization/Kit_Composition_and_relation.csv") output_file = os.path.join(base_dir, "s3://supply-roster-optimization/Kit_Composition_and_relation_cleaned_with_line_type.csv") try: # Load the original data df = load_kit_composition_data(input_file) # Process each type of kit master_df = process_master_kits(df) subkit_df = process_sub_kits(df) prepack_df = process_prepacks(df) # Concatenate and save final_df = concatenate_and_save(master_df, subkit_df, prepack_df, output_file) # Display summary statistics print("\n=== SUMMARY ===") print("Line type distribution:") print(final_df['line_type'].value_counts(dropna=False)) print("\nKit type distribution:") print(final_df['kit_type'].value_counts()) print("\nSample of final data:") print(final_df.head(10)) except Exception as e: print(f"Error processing kit composition data: {e}") raise if __name__ == "__main__": main()