HaLim committed on
Commit
17be6b7
·
1 Parent(s): 7181629

kit composition data cleaner to generate Kit_Composition_and_relation_cleaned_with_line_type

Browse files
Files changed (1) hide show
  1. src/utils/kit_composition_cleaner.py +209 -0
src/utils/kit_composition_cleaner.py ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Kit Composition Data Cleaner
3
+
4
+ This script converts the Kit_Composition_and_relation.csv file into a cleaned format
5
+ with line types according to the following rules:
6
+
7
+ 1. Master Kits:
8
+ - If appears only once (standalone master): line_type = "long line"
9
+ - If appears multiple times: line_type = "" (empty/theoretical)
10
+
11
+ 2. Sub Kits:
12
+ - All sub kits get line_type = "long line"
13
+
14
+ 3. Prepacks:
15
+ - All prepacks get line_type = "miniload"
16
+
17
+ The output includes columns: kit_name, kit_description, kit_type, line_type
18
+ """
19
+
20
+ import pandas as pd
21
+ import os
22
+ from typing import Tuple
23
+
24
+
25
def load_kit_composition_data(file_path: str) -> pd.DataFrame:
    """Read the Kit Composition and relation CSV into a DataFrame.

    Args:
        file_path: Path to the source CSV file.

    Returns:
        The loaded DataFrame.

    Raises:
        FileNotFoundError: If *file_path* does not exist.
    """
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"File not found: {file_path}")

    frame = pd.read_csv(file_path)
    print(f"Loaded {len(frame)} rows from {file_path}")
    return frame
33
+
34
+
35
def process_master_kits(df: pd.DataFrame) -> pd.DataFrame:
    """Build one record per unique Master Kit with its line type.

    Business rules:
        - Standalone masters (appear exactly once): line_type = "long line"
        - Non-standalone masters: line_type = "" (empty / theoretical)

    Args:
        df: Source frame with 'Master Kit' and 'Master Kit Description' columns.

    Returns:
        DataFrame with columns kit_name, kit_description, kit_type, line_type.
    """
    print("Processing Master Kits...")

    # Only consider rows that actually name a master kit. value_counts()
    # silently drops NaN, so without this filter NaN-named rows would leak
    # NaN kit_name records into the output (the sub-kit and prepack
    # processors already filter on notna()).
    master_rows = df[df['Master Kit'].notna()]

    # A master appearing exactly once in the relation table is "standalone".
    master_counts = master_rows['Master Kit'].value_counts()
    standalone_masters = set(master_counts[master_counts == 1].index)

    print(f"Total unique Master Kits: {len(master_counts)}")
    print(f"Standalone masters (appear only once): {len(standalone_masters)}")

    # Deduplicate on the kit name alone: a master listed with two different
    # description spellings must still yield a single record (the first
    # description wins).
    unique_masters = master_rows.drop_duplicates(subset=['Master Kit'])

    master_data = []
    for _, row in unique_masters.iterrows():
        master_kit = row['Master Kit']
        line_type = "long line" if master_kit in standalone_masters else ""
        master_data.append({
            'kit_name': master_kit,
            'kit_description': row['Master Kit Description'],
            'kit_type': 'master',
            'line_type': line_type,
        })

    # Explicit columns keep the summary print below safe on empty input
    # (pd.DataFrame([]) would otherwise have no 'line_type' column).
    master_df = pd.DataFrame(
        master_data,
        columns=['kit_name', 'kit_description', 'kit_type', 'line_type'])
    print(f"Created {len(master_df)} master kit records")
    print(f"Standalone masters with 'long line': {sum(master_df['line_type'] == 'long line')}")

    return master_df
78
+
79
+
80
def process_sub_kits(df: pd.DataFrame) -> pd.DataFrame:
    """Build one record per unique Sub Kit.

    Business rules:
        - Every sub kit is routed to the long line: line_type = "long line".
        - Duplicate (name, description) pairs collapse to one record.

    Args:
        df: Source frame with 'Sub kit' and 'Sub kit description' columns.

    Returns:
        DataFrame with columns kit_name, kit_description, kit_type, line_type.
    """
    print("Processing Sub Kits...")

    named = df.loc[df['Sub kit'].notna(), ['Sub kit', 'Sub kit description']]

    if named.empty:
        print("No sub kits found")
        return pd.DataFrame(columns=['kit_name', 'kit_description', 'kit_type', 'line_type'])

    records = [
        {
            'kit_name': name,
            'kit_description': desc,
            'kit_type': 'subkit',
            'line_type': 'long line',
        }
        for name, desc in named.drop_duplicates().itertuples(index=False)
    ]

    subkit_result = pd.DataFrame(records)
    print(f"Created {len(subkit_result)} sub kit records")

    return subkit_result
111
+
112
+
113
def process_prepacks(df: pd.DataFrame) -> pd.DataFrame:
    """Build one record per unique Prepack.

    Business rules:
        - Every prepack is routed to the miniload: line_type = "miniload".
        - Duplicate (name, description) pairs collapse to one record.

    Args:
        df: Source frame with 'Prepack' and 'Prepack Description' columns.

    Returns:
        DataFrame with columns kit_name, kit_description, kit_type, line_type.
    """
    print("Processing Prepacks...")

    named = df.loc[df['Prepack'].notna(), ['Prepack', 'Prepack Description']]

    if named.empty:
        print("No prepacks found")
        return pd.DataFrame(columns=['kit_name', 'kit_description', 'kit_type', 'line_type'])

    records = [
        {
            'kit_name': name,
            'kit_description': desc,
            'kit_type': 'prepack',
            'line_type': 'miniload',
        }
        for name, desc in named.drop_duplicates().itertuples(index=False)
    ]

    prepack_result = pd.DataFrame(records)
    print(f"Created {len(prepack_result)} prepack records")

    return prepack_result
144
+
145
+
146
def concatenate_and_save(master_df: pd.DataFrame, subkit_df: pd.DataFrame,
                         prepack_df: pd.DataFrame, output_path: str) -> pd.DataFrame:
    """Stack the three processed kit frames and persist the cleaned CSV.

    The combined frame is normalised (NaN line types become empty strings),
    sorted by kit_type then kit_name, and written to *output_path*.

    Args:
        master_df: Processed master kit records.
        subkit_df: Processed sub kit records.
        prepack_df: Processed prepack records.
        output_path: Destination path for the cleaned CSV.

    Returns:
        The final concatenated DataFrame.
    """
    print("Concatenating results...")

    combined = pd.concat([master_df, subkit_df, prepack_df], ignore_index=True)

    # Blank strings, not NaN, mark the "theoretical" (non-standalone) kits.
    combined['line_type'] = combined['line_type'].fillna('')

    # ignore_index=True is the shorthand for .reset_index(drop=True).
    combined = combined.sort_values(['kit_type', 'kit_name'], ignore_index=True)

    print(f"Final dataset contains {len(combined)} records:")
    for label, part in (('Masters', master_df),
                        ('Subkits', subkit_df),
                        ('Prepacks', prepack_df)):
        print(f" - {label}: {len(part)}")

    # na_rep='' keeps any remaining missing values as empty cells on disk.
    combined.to_csv(output_path, index=False, na_rep='')
    print(f"Saved cleaned data to: {output_path}")

    return combined
172
+
173
+
174
def main(base_dir: str = "/Users/halimjun/Coding_local/SD_roster_real") -> None:
    """Run the kit composition cleaning pipeline end to end.

    Loads the raw relation CSV, processes masters, sub kits and prepacks,
    writes the cleaned CSV, and prints summary statistics.

    Args:
        base_dir: Project root containing data/real_data_excel/converted_csv/.
            Defaults to the original author's local checkout path so existing
            invocations keep working; pass a different root to run elsewhere.

    Raises:
        FileNotFoundError: If the input CSV does not exist (re-raised after
            logging, as is any other processing error).
    """
    # Input/output both live under the converted_csv directory.
    input_file = os.path.join(base_dir, "data/real_data_excel/converted_csv/Kit_Composition_and_relation.csv")
    output_file = os.path.join(base_dir, "data/real_data_excel/converted_csv/Kit_Composition_and_relation_cleaned_with_line_type.csv")

    try:
        # Load the original data
        df = load_kit_composition_data(input_file)

        # Process each type of kit
        master_df = process_master_kits(df)
        subkit_df = process_sub_kits(df)
        prepack_df = process_prepacks(df)

        # Concatenate and save
        final_df = concatenate_and_save(master_df, subkit_df, prepack_df, output_file)

        # Display summary statistics
        print("\n=== SUMMARY ===")
        print("Line type distribution:")
        print(final_df['line_type'].value_counts(dropna=False))
        print("\nKit type distribution:")
        print(final_df['kit_type'].value_counts())

        print("\nSample of final data:")
        print(final_df.head(10))

    except Exception as e:
        # Log before re-raising so the failure context survives in script output.
        print(f"Error processing kit composition data: {e}")
        raise


if __name__ == "__main__":
    main()