rairo committed on
Commit
a66eb56
·
verified ·
1 Parent(s): fc7b3ea

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +32 -23
app.py CHANGED
@@ -39,33 +39,35 @@ def standardize_dataframe(df: pd.DataFrame) -> pd.DataFrame:
39
  """
40
  Standardize DataFrame column names and data types:
41
  - Drops any middle name columns.
42
- - Cleans all column names (e.g., "Employee Name" -> "employee_name").
43
- - Renames synonyms to common names (e.g., 'tin', 'salary').
 
44
  - Creates an 'employee_name' column if missing but first_name and last_name exist.
45
  - Combines duplicate key columns into one.
46
- - Forces the key columns 'tin' and 'employee_name' to be strings.
47
  """
48
  # Drop any column that appears to be a middle name
49
  middle_name_cols = [col for col in df.columns if 'middle_name' in col.lower()]
50
  if middle_name_cols:
51
  df = df.drop(columns=middle_name_cols)
52
 
53
- # Clean all column names first so that "Employee Name" becomes "employee_name"
54
  df.columns = [clean_column_name(col) for col in df.columns]
55
 
56
- # Rename columns based on synonyms for TIN and salary
 
57
  rename_map = {}
58
  for col in df.columns:
59
  if col in ['personal id', 'personal_id', 'tax id', 'taxid'] or "personal_id_of_employee" in col:
60
  rename_map[col] = 'tin'
61
- elif 'tin' in col:
62
  rename_map[col] = 'tin'
63
  if any(keyword in col for keyword in ['salary', 'wage', 'earning', 'commission', 'fee', 'payment', 'compensation']):
64
  rename_map[col] = 'salary'
65
  if rename_map:
66
  df = df.rename(columns=rename_map)
67
 
68
- # Combine duplicate columns (e.g., multiple salary or tin columns)
69
  if 'salary' in df.columns and list(df.columns).count('salary') > 1:
70
  salary_cols = [col for col in df.columns if col == 'salary']
71
  df['salary'] = df[salary_cols].bfill(axis=1).iloc[:, 0]
@@ -75,11 +77,11 @@ def standardize_dataframe(df: pd.DataFrame) -> pd.DataFrame:
75
  df['tin'] = df[tin_cols].bfill(axis=1).iloc[:, 0]
76
  df = df.loc[:, ~df.columns.duplicated()]
77
 
78
- # If employee_name is missing and first_name and last_name exist, create it.
79
  if 'employee_name' not in df.columns and 'first_name' in df.columns and 'last_name' in df.columns:
80
  df['employee_name'] = df['first_name'].astype(str).str.strip() + ' ' + df['last_name'].astype(str).str.strip()
81
 
82
- # Ensure key columns are of the correct type
83
  if 'salary' in df.columns:
84
  df['salary'] = pd.to_numeric(df['salary'], errors='coerce')
85
  if 'tin' in df.columns:
@@ -179,15 +181,16 @@ def read_excel_file(file) -> pd.DataFrame:
179
  def merge_with_master(processed_files):
180
  """
181
  Merge multiple DataFrames using a two-step process:
182
- 1. Use the earnings file as the master. Drop its inaccurate 'tin' column.
183
- 2. Merge template info onto earnings via 'employee_name' (the key provided by "Employee Name").
184
- 3. Then merge the resulting DataFrame with the PAYE file using the (correct) 'tin' key.
 
185
  """
186
  earnings_file = None
187
  paye_file = None
188
  template_file = None
189
 
190
- # Identify files based on filename keywords
191
  for file_info in processed_files:
192
  lower_filename = file_info["filename"].lower()
193
  if "earnings" in lower_filename:
@@ -196,16 +199,17 @@ def merge_with_master(processed_files):
196
  paye_file = file_info
197
  elif "template" in lower_filename:
198
  template_file = file_info
 
199
  if not earnings_file:
200
  st.warning("No earnings file found as master. Using the first file as master.")
201
  earnings_file = processed_files[0]
202
 
203
- # Use the earnings DataFrame as the master
204
  earnings_df = earnings_file["df"]
205
- # Drop the inaccurate 'tin' column from earnings, if present
206
  if 'tin' in earnings_df.columns:
207
  earnings_df = earnings_df.drop(columns=['tin'])
208
- # Double-check removal of any middle_name column (should already be done in standardization)
209
  if 'middle_name' in earnings_df.columns:
210
  earnings_df = earnings_df.drop(columns=['middle_name'])
211
 
@@ -215,10 +219,10 @@ def merge_with_master(processed_files):
215
  if template_file is not None:
216
  st.write(f"Merging template info from '{template_file['filename']}' using key 'employee_name'.")
217
  template_df = template_file["df"]
218
- # Drop any middle_name column from the template file
219
  if 'middle_name' in template_df.columns:
220
  template_df = template_df.drop(columns=['middle_name'])
221
- # Ensure template has an 'employee_name' column (constructed if necessary)
222
  if 'employee_name' not in template_df.columns and 'first_name' in template_df.columns and 'last_name' in template_df.columns:
223
  template_df['employee_name'] = template_df['first_name'].astype(str).str.strip() + ' ' + template_df['last_name'].astype(str).str.strip()
224
  if 'employee_name' in merged_df.columns and 'employee_name' in template_df.columns:
@@ -226,16 +230,21 @@ def merge_with_master(processed_files):
226
  else:
227
  st.warning("Column 'employee_name' missing in either the earnings or template file. Skipping template merge.")
228
  else:
229
- st.warning("No template file detected.")
230
 
 
 
 
 
 
231
  # Merge PAYE figures onto the merged DataFrame using 'tin'
232
  if paye_file is not None:
233
  st.write(f"Merging PAYE figures from '{paye_file['filename']}' using key 'tin'.")
234
  paye_df = paye_file["df"]
235
- if 'tin' in merged_df.columns and 'tin' in paye_df.columns:
236
  merged_df = merged_df.merge(paye_df, on='tin', how='left', suffixes=('', '_paye'))
237
  else:
238
- st.warning("Column 'tin' missing in either the merged or PAYE file. Skipping PAYE merge.")
239
  else:
240
  st.warning("No PAYE file detected.")
241
 
@@ -268,7 +277,7 @@ def main():
268
  if df.empty:
269
  st.warning(f"DataFrame from '{uploaded_file.name}' is empty. Please check the file.")
270
  continue
271
- # Standardize column names and key columns
272
  df = standardize_dataframe(df)
273
  st.write("Initial Preview:")
274
  st.dataframe(df.head())
@@ -277,7 +286,7 @@ def main():
277
  if analysis:
278
  st.write("Column Analysis:")
279
  st.json(analysis)
280
- # Apply suggested renames if provided by the analysis
281
  if 'suggested_renames' in analysis:
282
  df = df.rename(columns=analysis['suggested_renames'])
283
  processed_files.append(
 
39
  """
40
  Standardize DataFrame column names and data types:
41
  - Drops any middle name columns.
42
+ - Cleans all column names (e.g., "Employee Name" becomes "employee_name").
43
+ - Renames synonyms to common names (e.g., mapping TIN-related columns to 'tin'
44
+ and salary-related columns to 'salary').
45
  - Creates an 'employee_name' column if missing but first_name and last_name exist.
46
  - Combines duplicate key columns into one.
47
+ - Forces key columns (tin and employee_name) to be strings.
48
  """
49
  # Drop any column that appears to be a middle name
50
  middle_name_cols = [col for col in df.columns if 'middle_name' in col.lower()]
51
  if middle_name_cols:
52
  df = df.drop(columns=middle_name_cols)
53
 
54
+ # Clean all column names so that "Employee Name" becomes "employee_name", etc.
55
  df.columns = [clean_column_name(col) for col in df.columns]
56
 
57
+ # Build a rename map for TIN and salary synonyms.
58
+ # Note: This will capture PAYE's "tin_or_personal_id_of_employee" too.
59
  rename_map = {}
60
  for col in df.columns:
61
  if col in ['personal id', 'personal_id', 'tax id', 'taxid'] or "personal_id_of_employee" in col:
62
  rename_map[col] = 'tin'
63
+ elif 'tin' in col and 'tin' not in rename_map.get(col, ''):
64
  rename_map[col] = 'tin'
65
  if any(keyword in col for keyword in ['salary', 'wage', 'earning', 'commission', 'fee', 'payment', 'compensation']):
66
  rename_map[col] = 'salary'
67
  if rename_map:
68
  df = df.rename(columns=rename_map)
69
 
70
+ # Combine duplicate columns for salary and tin if needed
71
  if 'salary' in df.columns and list(df.columns).count('salary') > 1:
72
  salary_cols = [col for col in df.columns if col == 'salary']
73
  df['salary'] = df[salary_cols].bfill(axis=1).iloc[:, 0]
 
77
  df['tin'] = df[tin_cols].bfill(axis=1).iloc[:, 0]
78
  df = df.loc[:, ~df.columns.duplicated()]
79
 
80
+ # If employee_name is missing but first_name and last_name exist, create it.
81
  if 'employee_name' not in df.columns and 'first_name' in df.columns and 'last_name' in df.columns:
82
  df['employee_name'] = df['first_name'].astype(str).str.strip() + ' ' + df['last_name'].astype(str).str.strip()
83
 
84
+ # Ensure key columns are of the proper type.
85
  if 'salary' in df.columns:
86
  df['salary'] = pd.to_numeric(df['salary'], errors='coerce')
87
  if 'tin' in df.columns:
 
181
  def merge_with_master(processed_files):
182
  """
183
  Merge multiple DataFrames using a two-step process:
184
+ 1. Use the earnings file as master and drop its inaccurate 'tin' column.
185
+ 2. Merge template info onto earnings using 'employee_name' (the key provided by "Employee Name").
186
+ The trusted 'tin' comes from the template file.
187
+ 3. Merge the combined earnings–template DataFrame with the PAYE file using 'tin'.
188
  """
189
  earnings_file = None
190
  paye_file = None
191
  template_file = None
192
 
193
+ # Identify files based on filename keywords.
194
  for file_info in processed_files:
195
  lower_filename = file_info["filename"].lower()
196
  if "earnings" in lower_filename:
 
199
  paye_file = file_info
200
  elif "template" in lower_filename:
201
  template_file = file_info
202
+
203
  if not earnings_file:
204
  st.warning("No earnings file found as master. Using the first file as master.")
205
  earnings_file = processed_files[0]
206
 
207
+ # Use the earnings DataFrame as the master.
208
  earnings_df = earnings_file["df"]
209
+ # Drop the inaccurate 'tin' column from earnings if it exists.
210
  if 'tin' in earnings_df.columns:
211
  earnings_df = earnings_df.drop(columns=['tin'])
212
+ # Double-check removal of any middle_name column (should be done in standardization).
213
  if 'middle_name' in earnings_df.columns:
214
  earnings_df = earnings_df.drop(columns=['middle_name'])
215
 
 
219
  if template_file is not None:
220
  st.write(f"Merging template info from '{template_file['filename']}' using key 'employee_name'.")
221
  template_df = template_file["df"]
222
+ # Drop any middle_name column from the template file.
223
  if 'middle_name' in template_df.columns:
224
  template_df = template_df.drop(columns=['middle_name'])
225
+ # Ensure template has an 'employee_name' column (construct if necessary).
226
  if 'employee_name' not in template_df.columns and 'first_name' in template_df.columns and 'last_name' in template_df.columns:
227
  template_df['employee_name'] = template_df['first_name'].astype(str).str.strip() + ' ' + template_df['last_name'].astype(str).str.strip()
228
  if 'employee_name' in merged_df.columns and 'employee_name' in template_df.columns:
 
230
  else:
231
  st.warning("Column 'employee_name' missing in either the earnings or template file. Skipping template merge.")
232
  else:
233
+ st.warning("No template file detected. Cannot proceed without a trusted TIN from the template.")
234
 
235
+ # After merging, check that a trusted 'tin' is present from the template.
236
+ if 'tin' not in merged_df.columns:
237
+ st.error("No trusted 'tin' column found in the merged earnings-template data. Aborting further merge.")
238
+ return merged_df
239
+
240
  # Merge PAYE figures onto the merged DataFrame using 'tin'
241
  if paye_file is not None:
242
  st.write(f"Merging PAYE figures from '{paye_file['filename']}' using key 'tin'.")
243
  paye_df = paye_file["df"]
244
+ if 'tin' in paye_df.columns:
245
  merged_df = merged_df.merge(paye_df, on='tin', how='left', suffixes=('', '_paye'))
246
  else:
247
+ st.warning("Column 'tin' missing in the PAYE file. Skipping PAYE merge.")
248
  else:
249
  st.warning("No PAYE file detected.")
250
 
 
277
  if df.empty:
278
  st.warning(f"DataFrame from '{uploaded_file.name}' is empty. Please check the file.")
279
  continue
280
+ # Standardize columns and key identifiers.
281
  df = standardize_dataframe(df)
282
  st.write("Initial Preview:")
283
  st.dataframe(df.head())
 
286
  if analysis:
287
  st.write("Column Analysis:")
288
  st.json(analysis)
289
+ # Apply any suggested renames from the analysis.
290
  if 'suggested_renames' in analysis:
291
  df = df.rename(columns=analysis['suggested_renames'])
292
  processed_files.append(