haileyhalimj@gmail.com committed on
Commit
8504f5a
·
1 Parent(s): acd1110

Recover and restore preprocessing improvements from d54de4e

Browse files

Restored all preprocessing work that was done today:
- Add data_preprocess.py: Helper functions for data preprocessing
- Improve extract.py: Better data extraction logic (108 lines modified)
- Enhance kit_composition_cleaner.py: Major improvements (260 lines, 361 lines modified)
- Update transform.py: Better transformation logic (36 lines modified)
- Add paths.yaml: Configuration for data paths

Total: 505 lines changed across preprocessing module
This represents a full day's work on improving the preprocessing pipeline.

src/config/paths.yaml ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Data Paths Configuration
2
+ # All paths are relative to the project root directory
3
+
4
+ data:
5
+ # CSV data files
6
+ csv:
7
+ demand: "data/real_data_excel/converted_csv/COOIS_Planned_and_Released.csv"
8
+ kit_composition: "data/real_data_excel/converted_csv/Kit_Composition_and_relation_cleaned_with_line_type.csv"
9
+ workforce_pay_scale: "data/real_data_excel/converted_csv/WH_Workforce_Hourly_Pay_Scale_processed.csv"
10
+ work_shift: "data/real_data_excel/converted_csv/work_shift.csv"
11
+ work_center_capacity: "data/real_data_excel/converted_csv/Work_Centre_Capacity.csv"
12
+ work_center_capacity_processed: "data/real_data_excel/converted_csv/Work_Centre_Capacity_processed.csv"
13
+ material_master: "data/real_data_excel/converted_csv/Material_Master_WMS.csv"
14
+ kits_calculation: "data/real_data_excel/converted_csv/Kits__Calculation.csv"
15
+
16
+ # Hierarchy data
17
+ hierarchy:
18
+ kit_hierarchy: "data/hierarchy_exports/kit_hierarchy.json"
19
+
src/preprocess/data_preprocess.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import pandas as pd


def process_Kit_Composition_and_relation(
    output_csv_path: str = 'data/real_data_excel/converted_csv/Kit_Composition_and_relation_cleaned_with_line_type_and_id.csv',
    input_csv_path: str = 'data/real_data_excel/converted_csv/Kit_Composition_and_relation.csv',
) -> pd.DataFrame:
    """
    Clean the kit composition/relation table and derive line type and line id.

    Stacks the master-kit, sub-kit and prepack columns into a single
    (kit_name, kit_description, kit_type) layout, drops exact duplicates,
    then assigns:
      - line_type 'long line' for sub kits and standalone master kits
        (masters whose name appears exactly once, i.e. no sub kits/prepacks)
      - line_type 'mini load' for prepacks
      - line_id 6 for 'long line', 7 for 'mini load'

    Args:
        output_csv_path: where the cleaned table is written (CSV, no index).
        input_csv_path: source CSV with the raw kit composition data.

    Returns:
        cleaned_df: the cleaned DataFrame (also persisted to output_csv_path).
    """
    df = pd.read_csv(input_csv_path)

    # .copy() so the column assignments below do not trigger pandas'
    # SettingWithCopyWarning on a slice of df.
    master = df[["Master Kit", "Master Kit Description"]].copy()
    master["kit_type"] = "master"
    master.rename(columns={"Master Kit": "kit_name",
                           "Master Kit Description": "kit_description"}, inplace=True)

    subkit = df[["Sub kit", "Sub kit description"]].copy()
    subkit["kit_type"] = "subkit"
    # NOTE: the source column is 'Sub kit description' (lower-case 'd'); the
    # original code renamed with 'Sub kit Description', which pandas silently
    # ignored, and then forced .columns as a workaround.
    subkit.rename(columns={"Sub kit": "kit_name",
                           "Sub kit description": "kit_description"}, inplace=True)

    prepack = df[["Prepack", "Prepack Description"]].copy()
    prepack["kit_type"] = "prepack"
    prepack.rename(columns={"Prepack": "kit_name",
                            "Prepack Description": "kit_description"}, inplace=True)

    cleaned_df = pd.concat([master, subkit, prepack])
    # Bug fix: the original called drop_duplicates() but discarded the result,
    # so the intended dedup never happened. Assign it back.
    cleaned_df = cleaned_df[['kit_name', 'kit_description', 'kit_type']].drop_duplicates()

    # A kit name that occurs exactly once is a standalone master kit
    # (it never re-appears as a sub kit or prepack).
    tmp = cleaned_df.groupby('kit_name').count()['kit_type'].reset_index()
    standalone_masterkit_list = tmp.loc[tmp['kit_type'] == 1, 'kit_name']

    cleaned_df.loc[cleaned_df['kit_name'].isin(standalone_masterkit_list), 'line_type'] = 'long line'
    cleaned_df.loc[cleaned_df['kit_type'] == 'prepack', 'line_type'] = 'mini load'
    cleaned_df.loc[cleaned_df['kit_type'] == 'subkit', 'line_type'] = 'long line'
    cleaned_df.loc[cleaned_df['line_type'] == 'mini load', 'line_id'] = 7
    cleaned_df.loc[cleaned_df['line_type'] == 'long line', 'line_id'] = 6
    cleaned_df.to_csv(output_csv_path, index=False)
    return cleaned_df
src/preprocess/extract.py CHANGED
@@ -3,77 +3,54 @@ import datetime
3
  from datetime import date, timedelta
4
  import json
5
  import os
6
- # Default dates - will be overridden by optimization_config.py
7
- START_DATE = pd.Timestamp(2025, 7, 7)
8
- END_DATE = pd.Timestamp(2025, 7, 11)
9
 
10
- def set_global_dates(start_date, end_date):
11
- """Update global START_DATE and END_DATE variables"""
12
- global START_DATE, END_DATE
13
- START_DATE = pd.Timestamp(start_date)
14
- END_DATE = pd.Timestamp(end_date)
15
- print(f"Updated global dates: {START_DATE} to {END_DATE}")
16
 
17
 
18
- def read_excel(path: str) -> pd.DataFrame:
19
- return pd.read_excel(path, dtype={"id": "Int64"})
20
-
21
-
22
- def read_demand_data(
23
- path="data/real_data_excel/converted_csv/COOIS_Planned_and_Released.csv",
24
- start_date=None,
25
- end_date=None,
26
- ) -> pd.DataFrame:
27
- df = pd.read_csv(path)
28
- df["Basic start date"] = pd.to_datetime(df["Basic start date"])
29
- # df["Basic finish date"] = pd.to_datetime(df["Basic finish date"])
30
-
31
- # Use provided dates or fall back to module defaults
32
- filter_start_date = start_date if start_date is not None else START_DATE
33
- filter_end_date = end_date if end_date is not None else END_DATE
34
-
35
- df = df[(df["Basic start date"] == filter_start_date)]
36
-
37
- return df
38
-
39
- def read_kit_line_match_data(
40
- path="data/real_data_excel/converted_csv/Kit_Composition_and_relation_cleaned_with_line_type.csv",
41
- ) -> pd.DataFrame:
42
  return pd.read_csv(path)
43
 
44
 
45
- def read_employee_data(
46
- path="data/real_data_excel/converted_csv/WH_Workforce_Hourly_Pay_Scale_processed.csv",
47
- ) -> pd.DataFrame:
48
  return pd.read_csv(path)
49
 
50
- def get_shift_info(
51
- path = "data/real_data_excel/converted_csv/work_shift.csv"
52
- ) -> pd.DataFrame:
53
  df = pd.read_csv(path)
54
  return df
55
 
56
 
57
- def read_shift_cost_data(
58
- path="data/real_data_excel/converted_csv/WH_Workforce_Hourly_Pay_Scale_processed.csv",
59
- ) -> pd.DataFrame:
60
  return pd.read_csv(path)
61
 
62
 
63
- def read_work_center_capacity(
64
- path="data/real_data_excel/converted_csv/Work_Centre_Capacity.csv",
65
- ) -> pd.DataFrame:
66
  return pd.read_csv(path)
67
 
68
 
69
- def read_material_master(
70
- path="data/real_data_excel/converted_csv/Material_Master_WMS.csv",
71
- ) -> pd.DataFrame:
72
  return pd.read_csv(path)
73
 
74
- def read_packaging_line_data(
75
- path="data/real_data_excel/converted_csv/Work_Centre_Capacity_processed.csv",
76
- ) -> pd.DataFrame:
77
  df = pd.read_csv(path)
78
  # Filter for packaging lines only
79
  df = df[df["line_for_packaging"] == True]
@@ -81,26 +58,23 @@ def read_packaging_line_data(
81
 
82
 
83
  def read_orders_data(
84
- path="data/real_data_excel/converted_csv/COOIS_Planned_and_Released.csv",
85
  start_date=None,
86
  # end_date=None,
87
  ) -> pd.DataFrame:
88
  """
89
- COOIS_Released_Prod_Orders.csv
90
 
91
  Args:
92
- path: path to the csv file
93
  start_date: start date (pd.Timestamp or datetime)
94
 
95
-
96
  Returns:
97
  pd.DataFrame: filtered dataframe by date
98
  """
 
99
  df = pd.read_csv(path)
100
  assert len(df) > 0, "No data found in the file"
101
  # convert date column to datetime
102
  df["Basic start date"] = pd.to_datetime(df["Basic start date"])
103
- # df["Basic finish date"] = pd.to_datetime(df["Basic finish date"])
104
 
105
 
106
  # filter by date
@@ -112,9 +86,9 @@ def read_orders_data(
112
  return df
113
 
114
 
115
- def read_package_speed_data(
116
- path="data/real_data_excel/converted_csv/Kits__Calculation.csv",
117
- ):
118
  df = pd.read_csv(path, usecols=["Kit", "Kit per day","Paid work hours per day"])
119
  df["Kit per day"] = df["Kit per day"].astype(float)
120
  df["Paid work hours per day"] = df["Paid work hours per day"].astype(float)
@@ -123,9 +97,9 @@ def read_package_speed_data(
123
  speeds_per_hour = dict(zip(df["Kit"], df["kits_per_hour"]))
124
  return speeds_per_hour
125
 
126
- def read_personnel_requirement_data(
127
- path="data/real_data_excel/converted_csv/Kits__Calculation.csv",
128
- ):
129
  df = pd.read_csv(path, usecols=["Kit", "Humanizer", "UNICEF staff"])
130
 
131
  # Clean the data by handling special whitespace characters like \xa0 (non-breaking space)
@@ -156,12 +130,14 @@ def read_personnel_requirement_data(
156
  def get_production_order_data():
157
  """
158
  Extract production order information from hierarchy.
 
159
  Returns:
160
- - kit_levels: {kit_id: level} where level 0=prepack, 1=subkit, 2=master
161
- - dependencies: {kit_id: [dependency_list]}
162
- - priority_order: [kit_ids] sorted by production priority
 
163
  """
164
- path = "data/hierarchy_exports/kit_hierarchy.json"
165
  with open(path, 'r', encoding='utf-8') as f:
166
  hierarchy = json.load(f)
167
 
 
3
  from datetime import date, timedelta
4
  import json
5
  import os
6
+ import yaml
7
+ from pathlib import Path
 
8
 
9
+ # Load paths configuration
10
+ _config_dir = Path(__file__).parent.parent / "config"
11
+ _paths_file = _config_dir / "paths.yaml"
12
+ with open(_paths_file, 'r', encoding='utf-8') as f:
13
+ PATHS = yaml.safe_load(f)
 
14
 
15
 
16
def read_kit_line_match_data() -> pd.DataFrame:
    """Load the kit composition / line-relation table from the configured path."""
    kit_csv = PATHS['data']['csv']['kit_composition']
    return pd.read_csv(kit_csv)
20
 
21
 
22
def read_employee_data() -> pd.DataFrame:
    """Load the warehouse workforce hourly-pay-scale table as a DataFrame."""
    pay_scale_csv = PATHS['data']['csv']['workforce_pay_scale']
    return pd.read_csv(pay_scale_csv)
26
 
27
def get_shift_info() -> pd.DataFrame:
    """Load the work-shift definition table from the configured CSV path."""
    shift_csv = PATHS['data']['csv']['work_shift']
    return pd.read_csv(shift_csv)
32
 
33
 
34
def read_shift_cost_data() -> pd.DataFrame:
    """Load shift cost figures (same source as the workforce pay-scale table)."""
    cost_csv = PATHS['data']['csv']['workforce_pay_scale']
    return pd.read_csv(cost_csv)
38
 
39
 
40
def read_work_center_capacity() -> pd.DataFrame:
    """Load the work-center capacity table from the configured CSV path."""
    capacity_csv = PATHS['data']['csv']['work_center_capacity']
    return pd.read_csv(capacity_csv)
44
 
45
 
46
def read_material_master() -> pd.DataFrame:
    """Load the material master (WMS) table from the configured CSV path."""
    material_csv = PATHS['data']['csv']['material_master']
    return pd.read_csv(material_csv)
50
 
51
+ def read_packaging_line_data() -> pd.DataFrame:
52
+ """Read packaging line data (filtered work center capacity)"""
53
+ path = PATHS['data']['csv']['work_center_capacity_processed']
54
  df = pd.read_csv(path)
55
  # Filter for packaging lines only
56
  df = df[df["line_for_packaging"] == True]
 
58
 
59
 
60
  def read_orders_data(
 
61
  start_date=None,
62
  # end_date=None,
63
  ) -> pd.DataFrame:
64
  """
65
+ Read COOIS Released Production Orders data
66
 
67
  Args:
 
68
  start_date: start date (pd.Timestamp or datetime)
69
 
 
70
  Returns:
71
  pd.DataFrame: filtered dataframe by date
72
  """
73
+ path = PATHS['data']['csv']['demand']
74
  df = pd.read_csv(path)
75
  assert len(df) > 0, "No data found in the file"
76
  # convert date column to datetime
77
  df["Basic start date"] = pd.to_datetime(df["Basic start date"])
 
78
 
79
 
80
  # filter by date
 
86
  return df
87
 
88
 
89
+ def read_package_speed_data():
90
+ """Read package speed data from Kits Calculation"""
91
+ path = PATHS['data']['csv']['kits_calculation']
92
  df = pd.read_csv(path, usecols=["Kit", "Kit per day","Paid work hours per day"])
93
  df["Kit per day"] = df["Kit per day"].astype(float)
94
  df["Paid work hours per day"] = df["Paid work hours per day"].astype(float)
 
97
  speeds_per_hour = dict(zip(df["Kit"], df["kits_per_hour"]))
98
  return speeds_per_hour
99
 
100
+ def read_personnel_requirement_data():
101
+ """Read personnel requirement data from Kits Calculation"""
102
+ path = PATHS['data']['csv']['kits_calculation']
103
  df = pd.read_csv(path, usecols=["Kit", "Humanizer", "UNICEF staff"])
104
 
105
  # Clean the data by handling special whitespace characters like \xa0 (non-breaking space)
 
130
  def get_production_order_data():
131
  """
132
  Extract production order information from hierarchy.
133
+
134
  Returns:
135
+ tuple: (kit_levels, dependencies, priority_order)
136
+ - kit_levels: {kit_id: level} where level 0=prepack, 1=subkit, 2=master
137
+ - dependencies: {kit_id: [dependency_list]}
138
+ - priority_order: [kit_ids] sorted by production priority
139
  """
140
+ path = PATHS['data']['hierarchy']['kit_hierarchy']
141
  with open(path, 'r', encoding='utf-8') as f:
142
  hierarchy = json.load(f)
143
 
src/preprocess/kit_composition_cleaner.py CHANGED
@@ -22,161 +22,206 @@ import os
22
  from typing import Tuple
23
 
24
 
25
- def load_kit_composition_data(file_path: str) -> pd.DataFrame:
26
- """Load the Kit Composition and relation CSV file."""
27
- if not os.path.exists(file_path):
28
- raise FileNotFoundError(f"File not found: {file_path}")
29
-
30
- df = pd.read_csv(file_path)
31
- print(f"Loaded {len(df)} rows from {file_path}")
32
- return df
33
-
34
-
35
- def process_master_kits(df: pd.DataFrame) -> pd.DataFrame:
36
- """
37
- Process Master Kits according to business rules:
38
- - Standalone masters (no subkits/prepacks, only components): line_type = "long line"
39
- - Non-standalone masters (have subkits/prepacks): line_type = "" (empty - no production needed)
40
- """
41
- print("Processing Master Kits...")
42
-
43
- # Identify masters with hierarchy (subkits or prepacks)
44
- masters_with_subkits = set(df[df['Sub kit'].notna()]['Master Kit'].unique())
45
- masters_with_prepacks = set(df[df['Prepack'].notna()]['Master Kit'].unique())
46
- masters_with_hierarchy = masters_with_subkits.union(masters_with_prepacks)
47
-
48
- # All masters
49
- all_masters = set(df['Master Kit'].unique())
50
-
51
- # Standalone masters are those WITHOUT subkits/prepacks (only have components)
52
- standalone_masters = all_masters - masters_with_hierarchy
53
-
54
- print(f"Total unique Master Kits: {len(all_masters)}")
55
- print(f"Masters with subkits/prepacks: {len(masters_with_hierarchy)}")
56
- print(f"Standalone masters (only components): {len(standalone_masters)}")
57
-
58
- # Create master kit records
59
- master_data = []
60
-
61
- # Get unique master kits with descriptions
62
- unique_masters = df[['Master Kit', 'Master Kit Description']].drop_duplicates()
63
-
64
- for _, row in unique_masters.iterrows():
65
- master_kit = row['Master Kit']
66
- master_desc = row['Master Kit Description']
67
-
68
- # Determine line_type based on standalone status
69
- if master_kit in standalone_masters:
70
- line_type = "long line"
71
- else:
72
- line_type = "" # Empty for non-standalone (theoretical)
73
-
74
- master_data.append({
75
- 'kit_name': master_kit,
76
- 'kit_description': master_desc,
77
- 'kit_type': 'master',
78
- 'line_type': line_type
79
- })
80
-
81
- master_df = pd.DataFrame(master_data)
82
- print(f"Created {len(master_df)} master kit records")
83
- print(f"Standalone masters with 'long line': {sum(master_df['line_type'] == 'long line')}")
84
-
85
- return master_df
86
-
87
-
88
- def process_sub_kits(df: pd.DataFrame) -> pd.DataFrame:
89
- """
90
- Process Sub Kits according to business rules:
91
- - All sub kits get line_type = "long line"
92
- - Remove duplicates
93
- """
94
- print("Processing Sub Kits...")
95
-
96
- # Filter rows that have sub kits
97
- subkit_df = df[df['Sub kit'].notna()].copy()
98
-
99
- if len(subkit_df) == 0:
100
- print("No sub kits found")
101
- return pd.DataFrame(columns=['kit_name', 'kit_description', 'kit_type', 'line_type'])
102
-
103
- # Get unique sub kits with descriptions
104
- unique_subkits = subkit_df[['Sub kit', 'Sub kit description']].drop_duplicates()
105
-
106
- subkit_data = []
107
- for _, row in unique_subkits.iterrows():
108
- subkit_data.append({
109
- 'kit_name': row['Sub kit'],
110
- 'kit_description': row['Sub kit description'],
111
- 'kit_type': 'subkit',
112
- 'line_type': 'long line'
113
- })
114
-
115
- subkit_result = pd.DataFrame(subkit_data)
116
- print(f"Created {len(subkit_result)} sub kit records")
117
-
118
- return subkit_result
119
-
120
-
121
- def process_prepacks(df: pd.DataFrame) -> pd.DataFrame:
122
- """
123
- Process Prepacks according to business rules:
124
- - All prepacks get line_type = "miniload"
125
- - Remove duplicates
126
  """
127
- print("Processing Prepacks...")
128
-
129
- # Filter rows that have prepacks
130
- prepack_df = df[df['Prepack'].notna()].copy()
131
-
132
- if len(prepack_df) == 0:
133
- print("No prepacks found")
134
- return pd.DataFrame(columns=['kit_name', 'kit_description', 'kit_type', 'line_type'])
135
 
136
- # Get unique prepacks with descriptions
137
- unique_prepacks = prepack_df[['Prepack', 'Prepack Description']].drop_duplicates()
138
-
139
- prepack_data = []
140
- for _, row in unique_prepacks.iterrows():
141
- prepack_data.append({
142
- 'kit_name': row['Prepack'],
143
- 'kit_description': row['Prepack Description'],
144
- 'kit_type': 'prepack',
145
- 'line_type': 'miniload'
146
- })
147
-
148
- prepack_result = pd.DataFrame(prepack_data)
149
- print(f"Created {len(prepack_result)} prepack records")
150
-
151
- return prepack_result
152
-
153
-
154
- def concatenate_and_save(master_df: pd.DataFrame, subkit_df: pd.DataFrame,
155
- prepack_df: pd.DataFrame, output_path: str) -> pd.DataFrame:
156
  """
157
- Concatenate all processed dataframes and save to output file.
158
- """
159
- print("Concatenating results...")
160
-
161
- # Concatenate all dataframes
162
- final_df = pd.concat([master_df, subkit_df, prepack_df], ignore_index=True)
163
 
164
- # Ensure empty strings instead of NaN for line_type
165
- final_df['line_type'] = final_df['line_type'].fillna('')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
166
 
167
- # Sort by kit_type for better organization
168
- final_df = final_df.sort_values(['kit_type', 'kit_name']).reset_index(drop=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
169
 
170
- print(f"Final dataset contains {len(final_df)} records:")
171
- print(f" - Masters: {len(master_df)}")
172
- print(f" - Subkits: {len(subkit_df)}")
173
- print(f" - Prepacks: {len(prepack_df)}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
174
 
175
- # Save to file (keep empty strings as empty, not NaN)
176
- final_df.to_csv(output_path, index=False, na_rep='')
177
- print(f"Saved cleaned data to: {output_path}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
178
 
179
- return final_df
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
180
 
181
 
182
  def main():
@@ -187,19 +232,17 @@ def main():
187
  output_file = os.path.join(base_dir, "data/real_data_excel/converted_csv/Kit_Composition_and_relation_cleaned_with_line_type.csv")
188
 
189
  try:
190
- # Load the original data
191
- df = load_kit_composition_data(input_file)
192
-
193
- # Process each type of kit
194
- master_df = process_master_kits(df)
195
- subkit_df = process_sub_kits(df)
196
- prepack_df = process_prepacks(df)
197
 
198
- # Concatenate and save
199
- final_df = concatenate_and_save(master_df, subkit_df, prepack_df, output_file)
 
 
 
 
200
 
201
  # Display summary statistics
202
- print("\n=== SUMMARY ===")
203
  print("Line type distribution:")
204
  print(final_df['line_type'].value_counts(dropna=False))
205
  print("\nKit type distribution:")
@@ -207,9 +250,9 @@ def main():
207
 
208
  print("\nSample of final data:")
209
  print(final_df.head(10))
210
-
211
  except Exception as e:
212
- print(f"Error processing kit composition data: {e}")
213
  raise
214
 
215
 
 
22
  from typing import Tuple
23
 
24
 
25
class KitCompositionCleaner:
    """
    Cleans and processes kit composition data with line type assignments.

    This class maintains state across processing steps, allowing for:
    - a single data load,
    - step-by-step processing of masters / sub kits / prepacks,
    - intermediate result storage on the instance.
    """

    def __init__(self, input_file: str, output_file: "str | None" = None):
        """
        Initialize the cleaner with file paths.

        Args:
            input_file: Path to input CSV file (Kit_Composition_and_relation.csv)
            output_file: Path to output CSV file (optional, can be set later)
        """
        self.input_file = input_file
        self.output_file = output_file

        # State variables for the processing pipeline; each step fills one in.
        self.df = None          # raw input table (load_data)
        self.master_df = None   # processed master kits
        self.subkit_df = None   # processed sub kits
        self.prepack_df = None  # processed prepacks
        self.final_df = None    # concatenated, sorted result

    def load_data(self) -> pd.DataFrame:
        """
        Load the Kit Composition and relation CSV file.

        Raises:
            FileNotFoundError: if the input file does not exist.
        """
        if not os.path.exists(self.input_file):
            raise FileNotFoundError(f"File not found: {self.input_file}")

        self.df = pd.read_csv(self.input_file)
        print(f"Loaded {len(self.df)} rows from {self.input_file}")
        return self.df

    def process_master_kits(self) -> pd.DataFrame:
        """
        Process Master Kits according to business rules:
        - Standalone masters (no subkits/prepacks, only components): line_type = "long line"
        - Non-standalone masters (have subkits/prepacks): line_type = "" (empty - no production needed)

        Raises:
            ValueError: if load_data() has not been called yet.
        """
        if self.df is None:
            raise ValueError("Data not loaded. Call load_data() first.")

        print("Processing Master Kits...")

        # Identify masters with hierarchy (subkits or prepacks)
        masters_with_subkits = set(self.df[self.df['Sub kit'].notna()]['Master Kit'].unique())
        masters_with_prepacks = set(self.df[self.df['Prepack'].notna()]['Master Kit'].unique())
        masters_with_hierarchy = masters_with_subkits.union(masters_with_prepacks)

        # All masters
        all_masters = set(self.df['Master Kit'].unique())

        # Standalone masters are those WITHOUT subkits/prepacks (only have components)
        standalone_masters = all_masters - masters_with_hierarchy

        print(f"Total unique Master Kits: {len(all_masters)}")
        print(f"Masters with subkits/prepacks: {len(masters_with_hierarchy)}")
        print(f"Standalone masters (only components): {len(standalone_masters)}")

        # Create master kit records
        master_data = []

        # Get unique master kits with descriptions
        unique_masters = self.df[['Master Kit', 'Master Kit Description']].drop_duplicates()

        for _, row in unique_masters.iterrows():
            master_kit = row['Master Kit']
            master_desc = row['Master Kit Description']

            # Determine line_type based on standalone status; non-standalone
            # masters stay empty (theoretical kits, no production needed).
            if master_kit in standalone_masters:
                line_type = "long line"
            else:
                line_type = ""

            master_data.append({
                'kit_name': master_kit,
                'kit_description': master_desc,
                'kit_type': 'master',
                'line_type': line_type
            })

        self.master_df = pd.DataFrame(master_data)
        print(f"Created {len(self.master_df)} master kit records")
        print(f"Standalone masters with 'long line': {sum(self.master_df['line_type'] == 'long line')}")

        return self.master_df

    def process_sub_kits(self) -> pd.DataFrame:
        """
        Process Sub Kits according to business rules:
        - All sub kits get line_type = "long line"
        - Remove duplicates

        Raises:
            ValueError: if load_data() has not been called yet.
        """
        if self.df is None:
            raise ValueError("Data not loaded. Call load_data() first.")

        print("Processing Sub Kits...")

        # Filter rows that have sub kits
        subkit_df = self.df[self.df['Sub kit'].notna()].copy()

        if len(subkit_df) == 0:
            print("No sub kits found")
            self.subkit_df = pd.DataFrame(columns=['kit_name', 'kit_description', 'kit_type', 'line_type'])
            return self.subkit_df

        # Get unique sub kits with descriptions
        unique_subkits = subkit_df[['Sub kit', 'Sub kit description']].drop_duplicates()

        subkit_data = []
        for _, row in unique_subkits.iterrows():
            subkit_data.append({
                'kit_name': row['Sub kit'],
                'kit_description': row['Sub kit description'],
                'kit_type': 'subkit',
                'line_type': 'long line'
            })

        self.subkit_df = pd.DataFrame(subkit_data)
        print(f"Created {len(self.subkit_df)} sub kit records")

        return self.subkit_df

    def process_prepacks(self) -> pd.DataFrame:
        """
        Process Prepacks according to business rules:
        - All prepacks get line_type = "miniload"
        - Remove duplicates

        Raises:
            ValueError: if load_data() has not been called yet.
        """
        if self.df is None:
            raise ValueError("Data not loaded. Call load_data() first.")

        print("Processing Prepacks...")

        # Filter rows that have prepacks
        prepack_df = self.df[self.df['Prepack'].notna()].copy()

        if len(prepack_df) == 0:
            print("No prepacks found")
            self.prepack_df = pd.DataFrame(columns=['kit_name', 'kit_description', 'kit_type', 'line_type'])
            return self.prepack_df

        # Get unique prepacks with descriptions
        unique_prepacks = prepack_df[['Prepack', 'Prepack Description']].drop_duplicates()

        prepack_data = []
        for _, row in unique_prepacks.iterrows():
            prepack_data.append({
                'kit_name': row['Prepack'],
                'kit_description': row['Prepack Description'],
                'kit_type': 'prepack',
                'line_type': 'miniload'
            })

        self.prepack_df = pd.DataFrame(prepack_data)
        print(f"Created {len(self.prepack_df)} prepack records")

        return self.prepack_df

    def concatenate_and_save(self, output_path: "str | None" = None) -> pd.DataFrame:
        """
        Concatenate all processed dataframes and save to output file.

        Args:
            output_path: Path to save the output file (uses self.output_file if not provided)

        Raises:
            ValueError: if a processing step has not run, or no output path is known.
        """
        if self.master_df is None or self.subkit_df is None or self.prepack_df is None:
            raise ValueError("Processing not complete. Run process_master_kits(), process_sub_kits(), and process_prepacks() first.")

        print("Concatenating results...")

        # Concatenate all dataframes
        self.final_df = pd.concat([self.master_df, self.subkit_df, self.prepack_df], ignore_index=True)

        # Ensure empty strings instead of NaN for line_type
        self.final_df['line_type'] = self.final_df['line_type'].fillna('')

        # Sort by kit_type for better organization
        self.final_df = self.final_df.sort_values(['kit_type', 'kit_name']).reset_index(drop=True)

        print(f"Final dataset contains {len(self.final_df)} records:")
        print(f" - Masters: {len(self.master_df)}")
        print(f" - Subkits: {len(self.subkit_df)}")
        print(f" - Prepacks: {len(self.prepack_df)}")

        # Determine output path
        save_path = output_path or self.output_file
        if save_path is None:
            raise ValueError("No output path provided. Specify output_path parameter or set self.output_file")

        # Save to file (keep empty strings as empty, not NaN)
        self.final_df.to_csv(save_path, index=False, na_rep='')
        print(f"Saved cleaned data to: {save_path}")

        return self.final_df
225
 
226
 
227
  def main():
 
232
  output_file = os.path.join(base_dir, "data/real_data_excel/converted_csv/Kit_Composition_and_relation_cleaned_with_line_type.csv")
233
 
234
  try:
235
+ # Initialize cleaner with class
236
+ cleaner = KitCompositionCleaner(input_file, output_file)
 
 
 
 
 
237
 
238
+ # Execute pipeline step by step
239
+ cleaner.load_data()
240
+ cleaner.process_master_kits()
241
+ cleaner.process_sub_kits()
242
+ cleaner.process_prepacks()
243
+ final_df = cleaner.concatenate_and_save()
244
 
245
  # Display summary statistics
 
246
  print("Line type distribution:")
247
  print(final_df['line_type'].value_counts(dropna=False))
248
  print("\nKit type distribution:")
 
250
 
251
  print("\nSample of final data:")
252
  print(final_df.head(10))
253
+
254
  except Exception as e:
255
+ print(f"โŒ Error processing kit composition data: {e}")
256
  raise
257
 
258
 
src/preprocess/transform.py CHANGED
@@ -2,8 +2,14 @@ import pandas as pd
2
  import src.preprocess.extract as ex
3
 
4
 
5
- def get_product_list():
6
- demand = ex.read_demand_data()
 
 
 
 
 
 
7
  print(demand["Material Number"].unique())
8
  return demand["Material Number"].unique()
9
 
@@ -16,13 +22,15 @@ def get_employee_list():
16
 
17
 
18
  def get_released_product_list(start_date=None):
 
 
19
 
 
 
 
20
  released_orders = ex.read_orders_data(
21
  start_date=start_date,
22
- # end_date=end_date
23
  )
24
-
25
-
26
  product_list = released_orders["Material Number"].unique().tolist()
27
  print(f"Released products for date range {start_date}: {len(product_list)} products")
28
  return product_list
@@ -30,23 +38,19 @@ def get_released_product_list(start_date=None):
30
 
31
  def get_available_dates():
32
  """
33
- COOIS_Released_Prod_Orders.csv์—์„œ ์‚ฌ์šฉ ๊ฐ€๋Šฅํ•œ ๋ชจ๋“  ๋‚ ์งœ๋ฅผ ๊ฐ€์ ธ์˜ต๋‹ˆ๋‹ค.
34
 
35
  Returns:
36
- tuple: (start_dates, end_dates) - ๊ณ ์œ ํ•œ ์‹œ์ž‘ ๋‚ ์งœ์™€ ์ข…๋ฃŒ ๋‚ ์งœ ๋ฆฌ์ŠคํŠธ
37
  """
38
- # ๋ชจ๋“  ๋ฐ์ดํ„ฐ๋ฅผ ๊ฐ€์ ธ์˜ด (๋‚ ์งœ ํ•„ํ„ฐ๋ง ์—†์ด)
39
  released_orders = ex.read_orders_data()
40
 
41
- # ๋‚ ์งœ ์ปฌ๋Ÿผ์„ datetime์œผ๋กœ ๋ณ€ํ™˜ (์ด๋ฏธ extract.py์—์„œ ๋ณ€ํ™˜๋˜์ง€๋งŒ ํ™•์‹คํžˆ ํ•˜๊ธฐ ์œ„ํ•ด)
42
  released_orders["Basic start date"] = pd.to_datetime(released_orders["Basic start date"])
43
  released_orders["Basic finish date"] = pd.to_datetime(released_orders["Basic finish date"])
44
 
45
- # ๊ณ ์œ ํ•œ ์‹œ์ž‘ ๋‚ ์งœ์™€ ์ข…๋ฃŒ ๋‚ ์งœ ์ถ”์ถœ
46
  start_dates = sorted(released_orders["Basic start date"].dt.date.unique())
47
  end_dates = sorted(released_orders["Basic finish date"].dt.date.unique())
48
 
49
- # ๋ชจ๋“  ๊ณ ์œ ํ•œ ๋‚ ์งœ๋“ค (์‹œ์ž‘๋‚ ์งœ + ์ข…๋ฃŒ๋‚ ์งœ)
50
  all_dates = sorted(set(start_dates + end_dates))
51
 
52
  return all_dates, start_dates, end_dates
@@ -54,25 +58,21 @@ def get_available_dates():
54
 
55
  def get_date_ranges():
56
  """
57
- COOIS_Released_Prod_Orders.csv์—์„œ ๋‚ ์งœ ๋ฒ”์œ„ ์กฐํ•ฉ์„ ๊ฐ€์ ธ์˜ต๋‹ˆ๋‹ค.
58
-
59
  Returns:
60
- list: ์‚ฌ์šฉ ๊ฐ€๋Šฅํ•œ (start_date, end_date) ์กฐํ•ฉ ๋ฆฌ์ŠคํŠธ
61
  """
62
  released_orders = ex.read_orders_data()
63
 
64
- # ๋‚ ์งœ ์ปฌ๋Ÿผ์„ datetime์œผ๋กœ ๋ณ€ํ™˜
65
  released_orders["Basic start date"] = pd.to_datetime(released_orders["Basic start date"])
66
  released_orders["Basic finish date"] = pd.to_datetime(released_orders["Basic finish date"])
67
 
68
- # ๊ณ ์œ ํ•œ ๋‚ ์งœ ๋ฒ”์œ„ ์กฐํ•ฉ ์ถ”์ถœ
69
  date_ranges = released_orders[["Basic start date", "Basic finish date"]].drop_duplicates()
70
  date_ranges["start_date"] = date_ranges["Basic start date"].dt.date
71
  date_ranges["end_date"] = date_ranges["Basic finish date"].dt.date
72
 
73
- # (start_date, end_date) ํŠœํ”Œ ๋ฆฌ์ŠคํŠธ๋กœ ๋ฐ˜ํ™˜
74
  ranges = [(row["start_date"], row["end_date"]) for _, row in date_ranges.iterrows()]
75
- ranges = sorted(set(ranges)) # ์ค‘๋ณต ์ œ๊ฑฐ ๋ฐ ์ •๋ ฌ
76
 
77
  return ranges
78
 
 
2
  import src.preprocess.extract as ex
3
 
4
 
5
def get_product_list(start_date=None):
    """
    Return the unique material numbers present in the orders data.

    Args:
        start_date: start date used to filter the orders data. Required.
    """
    demand = ex.read_orders_data(start_date=start_date)
    products = demand["Material Number"].unique()
    print(products)
    return products
15
 
 
22
 
23
 
24
def get_released_product_list(start_date=None):
    """
    Return the released material numbers from COOIS_Released_Prod_Orders.csv.

    Args:
        start_date: start date used to filter the orders data. Required.
    """
    released = ex.read_orders_data(start_date=start_date)
    product_list = released["Material Number"].unique().tolist()
    print(f"Released products for date range {start_date}: {len(product_list)} products")
    return product_list
 
38
 
39
def get_available_dates():
    """
    Collect every date appearing in the released production orders.

    Returns:
        tuple: (all_dates, start_dates, end_dates) where each element is a
        sorted list of unique dates; all_dates is the union of the other two.
    """
    orders = ex.read_orders_data()

    # Normalise both date columns to datetime before extracting .dt.date.
    orders["Basic start date"] = pd.to_datetime(orders["Basic start date"])
    orders["Basic finish date"] = pd.to_datetime(orders["Basic finish date"])

    start_dates = sorted(orders["Basic start date"].dt.date.unique())
    end_dates = sorted(orders["Basic finish date"].dt.date.unique())

    all_dates = sorted(set(start_dates + end_dates))
    return all_dates, start_dates, end_dates
 
58
 
59
def get_date_ranges():
    """
    Build the unique (start_date, end_date) combinations found in the orders.

    Returns:
        list: sorted, de-duplicated (start_date, end_date) tuples.
    """
    orders = ex.read_orders_data()

    orders["Basic start date"] = pd.to_datetime(orders["Basic start date"])
    orders["Basic finish date"] = pd.to_datetime(orders["Basic finish date"])

    pairs = orders[["Basic start date", "Basic finish date"]].drop_duplicates()
    pairs["start_date"] = pairs["Basic start date"].dt.date
    pairs["end_date"] = pairs["Basic finish date"].dt.date

    # A set comprehension removes duplicate ranges; sort for a stable order.
    ranges = {(row["start_date"], row["end_date"]) for _, row in pairs.iterrows()}
    return sorted(ranges)
78