rairo committed on
Commit
1fc5859
·
verified ·
1 Parent(s): c484caf

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +83 -56
app.py CHANGED
@@ -27,52 +27,66 @@ def normalize_columns(df):
27
 
28
  def read_excel_file(file, header_option=0):
29
  """
30
- Read an Excel file and normalize its column names.
31
- If your file uses multi-row headers, consider setting header_option=[0,1]
32
- and then flattening the MultiIndex.
33
  """
34
  df = pd.read_excel(file, header=header_option)
35
  df = normalize_columns(df)
 
 
36
  return df
37
 
 
 
 
 
 
 
 
 
 
 
 
38
  def process_employee_data(df):
39
- """Process employee personal information and create a clean TIN."""
40
  df = normalize_columns(df)
41
 
42
- # Create Employee Name if possible.
43
- if 'First Name' in df.columns and 'Last Name' in df.columns:
44
- df['Employee Name'] = df.apply(
45
- lambda x: f"{clean_name(x['First Name'])} {clean_name(x['Last Name'])}",
46
- axis=1
47
- )
48
 
49
- # Use either the "TIN" or "Personal ID of Employee" column.
50
  if 'TIN' in df.columns:
51
  df['TIN'] = df['TIN'].apply(standardize_tin)
52
- elif 'Personal ID of Employee' in df.columns:
53
- df['TIN'] = df['Personal ID of Employee'].apply(standardize_tin)
54
- df.drop(columns=['Personal ID of Employee'], inplace=True)
55
  else:
56
- raise KeyError("Employee data must contain a 'TIN' or 'Personal ID of Employee' column.")
 
 
 
 
 
57
 
58
  return df
59
 
60
  def process_salary_data(df):
61
- """Process salary (earnings) data."""
62
  df = normalize_columns(df)
63
 
64
- # Get the TIN column from one of the expected names.
65
  if 'TIN' in df.columns:
66
  df['TIN'] = df['TIN'].apply(standardize_tin)
67
- elif 'TIN or Personal ID of Employee' in df.columns:
68
- df['TIN'] = df['TIN or Personal ID of Employee'].apply(standardize_tin)
69
- df.drop(columns=['TIN or Personal ID of Employee'], inplace=True)
70
  else:
71
- raise KeyError("Salary data must contain a 'TIN' or 'TIN or Personal ID of Employee' column.")
 
 
 
 
 
72
 
73
- # Convert columns (other than known text columns) to numeric.
74
- ignore_cols = {'TIN', 'First Name', 'Middle Name', 'Last Name', 'Employee Name',
75
- 'Birth Date', 'Employed From date', 'Employed To date', 'Position'}
76
  for col in df.columns:
77
  if col not in ignore_cols:
78
  df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)
@@ -80,18 +94,20 @@ def process_salary_data(df):
80
  return df
81
 
82
  def process_paye_data(df):
83
- """Process PAYE data."""
84
  df = normalize_columns(df)
85
 
86
  if 'TIN' in df.columns:
87
  df['TIN'] = df['TIN'].apply(standardize_tin)
88
- elif 'TIN or Personal ID of Employee' in df.columns:
89
- df['TIN'] = df['TIN or Personal ID of Employee'].apply(standardize_tin)
90
- df.drop(columns=['TIN or Personal ID of Employee'], inplace=True)
91
  else:
92
- raise KeyError("PAYE data must contain a 'TIN' or 'TIN or Personal ID of Employee' column.")
 
 
 
 
 
93
 
94
- # Convert columns (other than known text/date columns) to numeric.
95
  ignore_cols = {'TIN', 'Employed From date', 'Employed To date'}
96
  for col in df.columns:
97
  if col not in ignore_cols:
@@ -101,26 +117,39 @@ def process_paye_data(df):
101
 
102
  def merge_dataframes(employee_df, salary_df, paye_df):
103
  """
104
- Merge employee, salary, and PAYE data.
105
- For overlapping columns (from salary and PAYE) we combine values so that nonzero
106
- values are retained.
107
  """
108
- # Merge employee and salary data. (The earnings data is the master.)
109
- merged_df = pd.merge(employee_df, salary_df, on='TIN', how='outer', suffixes=('', '_salary'))
 
 
 
 
 
110
 
111
- # Merge PAYE data.
112
- merged_df = pd.merge(merged_df, paye_df, on='TIN', how='outer', suffixes=('', '_paye'))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
113
 
114
- # Combine columns that were duplicated by the merge.
115
- # For any column that appears as "Column", "Column_salary", and/or "Column_paye",
116
- # we use nonzero (or non-null) values where available.
117
- all_columns = list(merged_df.columns)
118
- for col in all_columns:
119
- for suffix in ['_salary', '_paye']:
120
- dup_col = col + suffix
121
- if dup_col in merged_df.columns:
122
- merged_df[col] = merged_df[col].combine_first(merged_df[dup_col])
123
- merged_df.drop(columns=[dup_col], inplace=True)
124
 
125
  return merged_df
126
 
@@ -129,32 +158,30 @@ def main():
129
  st.write("""
130
  Upload the following files:
131
  1. Employee Information File (template)
132
- 2. Salary (earnings) Information File
133
  3. PAYE Information File
134
  """)
135
 
136
  employee_file = st.file_uploader("Upload Employee Information", type=['xlsx', 'xls'])
137
- salary_file = st.file_uploader("Upload Salary Information", type=['xlsx', 'xls'])
138
  paye_file = st.file_uploader("Upload PAYE Information", type=['xlsx', 'xls'])
139
 
140
  if employee_file and salary_file and paye_file:
141
  try:
142
- # If your earnings/PAYE files have extra header rows (e.g. a row with currency codes),
143
- # adjust header_option (e.g., header=[0,1]) and then flatten the columns.
144
  employee_df = read_excel_file(employee_file, header_option=0)
145
- salary_df = read_excel_file(salary_file, header_option=0)
146
- paye_df = read_excel_file(paye_file, header_option=0)
147
 
148
  employee_df = process_employee_data(employee_df)
149
- salary_df = process_salary_data(salary_df)
150
- paye_df = process_paye_data(paye_df)
151
 
152
  final_df = merge_dataframes(employee_df, salary_df, paye_df)
153
 
154
  st.subheader("Master Payroll Data Preview")
155
  st.dataframe(final_df)
156
 
157
- # Prepare the Excel file for download.
158
  output = BytesIO()
159
  with pd.ExcelWriter(output, engine='xlsxwriter') as writer:
160
  final_df.to_excel(writer, index=False, sheet_name='Master Payroll')
 
27
 
28
  def read_excel_file(file, header_option=0):
29
  """
30
+ Read an Excel file, normalize column names,
31
+ and drop completely empty rows/columns.
 
32
  """
33
  df = pd.read_excel(file, header=header_option)
34
  df = normalize_columns(df)
35
+ df.dropna(axis=0, how='all', inplace=True)
36
+ df.dropna(axis=1, how='all', inplace=True)
37
  return df
38
 
39
+ def get_column(df, possible_names):
40
+ """
41
+ Return the first matching column name (case-insensitive) from df.columns.
42
+ If none is found, return None.
43
+ """
44
+ lower_cols = {col.lower(): col for col in df.columns}
45
+ for name in possible_names:
46
+ if name.lower() in lower_cols:
47
+ return lower_cols[name.lower()]
48
+ return None
49
+
50
  def process_employee_data(df):
51
+ """Process employee personal information; create clean TIN and Employee Name."""
52
  df = normalize_columns(df)
53
 
54
+ # Create Employee Name if not present by combining first and last name.
55
+ if 'Employee Name' not in df.columns or df['Employee Name'].isna().all():
56
+ first_name_col = get_column(df, ["First Name", "First", "Forename"])
57
+ last_name_col = get_column(df, ["Last Name", "Surname", "Family Name", "Last"])
58
+ if first_name_col and last_name_col:
59
+ df["Employee Name"] = df[first_name_col].apply(clean_name) + " " + df[last_name_col].apply(clean_name)
60
 
61
+ # Standardize TIN using one of the expected headers.
62
  if 'TIN' in df.columns:
63
  df['TIN'] = df['TIN'].apply(standardize_tin)
 
 
 
64
  else:
65
+ alt = get_column(df, ["Personal ID of Employee"])
66
+ if alt:
67
+ df['TIN'] = df[alt].apply(standardize_tin)
68
+ df.drop(columns=[alt], inplace=True)
69
+ else:
70
+ raise KeyError("Employee data must contain a 'TIN' or 'Personal ID of Employee' column.")
71
 
72
  return df
73
 
74
  def process_salary_data(df):
75
+ """Process salary (earnings) data; convert non-key columns to numeric."""
76
  df = normalize_columns(df)
77
 
 
78
  if 'TIN' in df.columns:
79
  df['TIN'] = df['TIN'].apply(standardize_tin)
 
 
 
80
  else:
81
+ alt = get_column(df, ["TIN or Personal ID of Employee"])
82
+ if alt:
83
+ df['TIN'] = df[alt].apply(standardize_tin)
84
+ df.drop(columns=[alt], inplace=True)
85
+ else:
86
+ raise KeyError("Salary data must contain a 'TIN' or 'TIN or Personal ID of Employee' column.")
87
 
88
+ # Convert non-key columns to numeric.
89
+ ignore_cols = {'TIN', 'Employee Name', 'Currency'}
 
90
  for col in df.columns:
91
  if col not in ignore_cols:
92
  df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)
 
94
  return df
95
 
96
  def process_paye_data(df):
97
+ """Process PAYE data; convert non-key columns to numeric."""
98
  df = normalize_columns(df)
99
 
100
  if 'TIN' in df.columns:
101
  df['TIN'] = df['TIN'].apply(standardize_tin)
 
 
 
102
  else:
103
+ alt = get_column(df, ["TIN or Personal ID of Employee"])
104
+ if alt:
105
+ df['TIN'] = df[alt].apply(standardize_tin)
106
+ df.drop(columns=[alt], inplace=True)
107
+ else:
108
+ raise KeyError("PAYE data must contain a 'TIN' or 'TIN or Personal ID of Employee' column.")
109
 
110
+ # Convert non-key columns to numeric.
111
  ignore_cols = {'TIN', 'Employed From date', 'Employed To date'}
112
  for col in df.columns:
113
  if col not in ignore_cols:
 
117
 
118
  def merge_dataframes(employee_df, salary_df, paye_df):
119
  """
120
+ Merge the three datasets using the salary (earnings) file as the master.
121
+ Employee and PAYE info are left-joined on 'TIN' onto the salary file.
122
+ Overlapping columns are combined so that non-missing values are retained.
123
  """
124
+ # Use salary_df as master.
125
+ merged_df = salary_df.copy()
126
+
127
+ # Merge employee data (rename duplicate columns with suffix _emp).
128
+ merged_df = merged_df.merge(employee_df, on='TIN', how='left', suffixes=('', '_emp'))
129
+ # Merge PAYE data (suffix _paye).
130
+ merged_df = merged_df.merge(paye_df, on='TIN', how='left', suffixes=('', '_paye'))
131
 
132
+ # For columns that appear as duplicate (e.g., "Employee Name" and "Employee Name_emp"),
133
+ # combine them using combine_first.
134
+ for col in list(merged_df.columns):
135
+ if col.endswith('_emp'):
136
+ base = col[:-4]
137
+ if base in merged_df.columns:
138
+ merged_df[base] = merged_df[base].combine_first(merged_df[col])
139
+ else:
140
+ merged_df.rename(columns={col: base}, inplace=True)
141
+ merged_df.drop(columns=[col], inplace=True)
142
+ elif col.endswith('_paye'):
143
+ base = col[:-5]
144
+ if base in merged_df.columns:
145
+ merged_df[base] = merged_df[base].combine_first(merged_df[col])
146
+ else:
147
+ merged_df.rename(columns={col: base}, inplace=True)
148
+ merged_df.drop(columns=[col], inplace=True)
149
 
150
+ # Fill any remaining NaN in numeric columns with 0.
151
+ numeric_columns = merged_df.select_dtypes(include=[np.number]).columns
152
+ merged_df[numeric_columns] = merged_df[numeric_columns].fillna(0)
 
 
 
 
 
 
 
153
 
154
  return merged_df
155
 
 
158
  st.write("""
159
  Upload the following files:
160
  1. Employee Information File (template)
161
+ 2. Salary (earnings) Information File – this file is the master
162
  3. PAYE Information File
163
  """)
164
 
165
  employee_file = st.file_uploader("Upload Employee Information", type=['xlsx', 'xls'])
166
+ salary_file = st.file_uploader("Upload Salary (Earnings) Information", type=['xlsx', 'xls'])
167
  paye_file = st.file_uploader("Upload PAYE Information", type=['xlsx', 'xls'])
168
 
169
  if employee_file and salary_file and paye_file:
170
  try:
 
 
171
  employee_df = read_excel_file(employee_file, header_option=0)
172
+ salary_df = read_excel_file(salary_file, header_option=0)
173
+ paye_df = read_excel_file(paye_file, header_option=0)
174
 
175
  employee_df = process_employee_data(employee_df)
176
+ salary_df = process_salary_data(salary_df)
177
+ paye_df = process_paye_data(paye_df)
178
 
179
  final_df = merge_dataframes(employee_df, salary_df, paye_df)
180
 
181
  st.subheader("Master Payroll Data Preview")
182
  st.dataframe(final_df)
183
 
184
+ # Prepare Excel file for download.
185
  output = BytesIO()
186
  with pd.ExcelWriter(output, engine='xlsxwriter') as writer:
187
  final_df.to_excel(writer, index=False, sheet_name='Master Payroll')