File size: 7,477 Bytes
17be6b7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e542954
 
17be6b7
 
 
e542954
 
 
 
17be6b7
e542954
 
 
 
 
 
 
 
 
17be6b7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
"""
Kit Composition Data Cleaner

This script converts the Kit_Composition_and_relation.csv file into a cleaned format
with line types according to the following rules:

1. Master Kits:
   - If appears only once (standalone master): line_type = "long line"
   - If appears multiple times: line_type = "" (empty/theoretical)

2. Sub Kits:
   - All sub kits get line_type = "long line"

3. Prepacks:
   - All prepacks get line_type = "miniload"

The output includes columns: kit_name, kit_description, kit_type, line_type
"""

import pandas as pd
import os
from typing import Tuple


def load_kit_composition_data(file_path: str) -> pd.DataFrame:
    """Load the Kit Composition and relation CSV file."""
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"File not found: {file_path}")
    
    df = pd.read_csv(file_path)
    print(f"Loaded {len(df)} rows from {file_path}")
    return df


def process_master_kits(df: pd.DataFrame) -> pd.DataFrame:
    """
    Process Master Kits according to business rules:
    - Standalone masters (no subkits/prepacks, only components): line_type = "long line"
    - Non-standalone masters (have subkits/prepacks): line_type = "" (empty - no production needed)
    """
    print("Processing Master Kits...")
    
    # Identify masters with hierarchy (subkits or prepacks)
    masters_with_subkits = set(df[df['Sub kit'].notna()]['Master Kit'].unique())
    masters_with_prepacks = set(df[df['Prepack'].notna()]['Master Kit'].unique())
    masters_with_hierarchy = masters_with_subkits.union(masters_with_prepacks)
    
    # All masters
    all_masters = set(df['Master Kit'].unique())
    
    # Standalone masters are those WITHOUT subkits/prepacks (only have components)
    standalone_masters = all_masters - masters_with_hierarchy
    
    print(f"Total unique Master Kits: {len(all_masters)}")
    print(f"Masters with subkits/prepacks: {len(masters_with_hierarchy)}")
    print(f"Standalone masters (only components): {len(standalone_masters)}")
    
    # Create master kit records
    master_data = []
    
    # Get unique master kits with descriptions
    unique_masters = df[['Master Kit', 'Master Kit  Description']].drop_duplicates()
    
    for _, row in unique_masters.iterrows():
        master_kit = row['Master Kit']
        master_desc = row['Master Kit  Description']
        
        # Determine line_type based on standalone status
        if master_kit in standalone_masters:
            line_type = "long line"
        else:
            line_type = ""  # Empty for non-standalone (theoretical)
        
        master_data.append({
            'kit_name': master_kit,
            'kit_description': master_desc,
            'kit_type': 'master',
            'line_type': line_type
        })
    
    master_df = pd.DataFrame(master_data)
    print(f"Created {len(master_df)} master kit records")
    print(f"Standalone masters with 'long line': {sum(master_df['line_type'] == 'long line')}")
    
    return master_df


def process_sub_kits(df: pd.DataFrame) -> pd.DataFrame:
    """
    Process Sub Kits according to business rules:
    - All sub kits get line_type = "long line"
    - Remove duplicates
    """
    print("Processing Sub Kits...")
    
    # Filter rows that have sub kits
    subkit_df = df[df['Sub kit'].notna()].copy()
    
    if len(subkit_df) == 0:
        print("No sub kits found")
        return pd.DataFrame(columns=['kit_name', 'kit_description', 'kit_type', 'line_type'])
    
    # Get unique sub kits with descriptions
    unique_subkits = subkit_df[['Sub kit', 'Sub kit description']].drop_duplicates()
    
    subkit_data = []
    for _, row in unique_subkits.iterrows():
        subkit_data.append({
            'kit_name': row['Sub kit'],
            'kit_description': row['Sub kit description'],
            'kit_type': 'subkit',
            'line_type': 'long line'
        })
    
    subkit_result = pd.DataFrame(subkit_data)
    print(f"Created {len(subkit_result)} sub kit records")
    
    return subkit_result


def process_prepacks(df: pd.DataFrame) -> pd.DataFrame:
    """
    Process Prepacks according to business rules:
    - All prepacks get line_type = "miniload"
    - Remove duplicates
    """
    print("Processing Prepacks...")
    
    # Filter rows that have prepacks
    prepack_df = df[df['Prepack'].notna()].copy()
    
    if len(prepack_df) == 0:
        print("No prepacks found")
        return pd.DataFrame(columns=['kit_name', 'kit_description', 'kit_type', 'line_type'])
    
    # Get unique prepacks with descriptions
    unique_prepacks = prepack_df[['Prepack', 'Prepack Description']].drop_duplicates()
    
    prepack_data = []
    for _, row in unique_prepacks.iterrows():
        prepack_data.append({
            'kit_name': row['Prepack'],
            'kit_description': row['Prepack Description'],
            'kit_type': 'prepack',
            'line_type': 'miniload'
        })
    
    prepack_result = pd.DataFrame(prepack_data)
    print(f"Created {len(prepack_result)} prepack records")
    
    return prepack_result


def concatenate_and_save(master_df: pd.DataFrame, subkit_df: pd.DataFrame, 
                        prepack_df: pd.DataFrame, output_path: str) -> pd.DataFrame:
    """
    Concatenate all processed dataframes and save to output file.
    """
    print("Concatenating results...")
    
    # Concatenate all dataframes
    final_df = pd.concat([master_df, subkit_df, prepack_df], ignore_index=True)
    
    # Ensure empty strings instead of NaN for line_type
    final_df['line_type'] = final_df['line_type'].fillna('')
    
    # Sort by kit_type for better organization
    final_df = final_df.sort_values(['kit_type', 'kit_name']).reset_index(drop=True)
    
    print(f"Final dataset contains {len(final_df)} records:")
    print(f"  - Masters: {len(master_df)}")
    print(f"  - Subkits: {len(subkit_df)}")
    print(f"  - Prepacks: {len(prepack_df)}")
    
    # Save to file (keep empty strings as empty, not NaN)
    final_df.to_csv(output_path, index=False, na_rep='')
    print(f"Saved cleaned data to: {output_path}")
    
    return final_df


def main():
    """Main function to execute the kit composition cleaning process."""
    # Define file paths
    base_dir = "/Users/halimjun/Coding_local/SD_roster_real"
    input_file = os.path.join(base_dir, "data/real_data_excel/converted_csv/Kit_Composition_and_relation.csv")
    output_file = os.path.join(base_dir, "data/real_data_excel/converted_csv/Kit_Composition_and_relation_cleaned_with_line_type.csv")
    
    try:
        # Load the original data
        df = load_kit_composition_data(input_file)
        
        # Process each type of kit
        master_df = process_master_kits(df)
        subkit_df = process_sub_kits(df)
        prepack_df = process_prepacks(df)
        
        # Concatenate and save
        final_df = concatenate_and_save(master_df, subkit_df, prepack_df, output_file)
        
        # Display summary statistics
        print("\n=== SUMMARY ===")
        print("Line type distribution:")
        print(final_df['line_type'].value_counts(dropna=False))
        print("\nKit type distribution:")
        print(final_df['kit_type'].value_counts())
        
        print("\nSample of final data:")
        print(final_df.head(10))
        
    except Exception as e:
        print(f"Error processing kit composition data: {e}")
        raise


if __name__ == "__main__":
    main()