rairo committed on
Commit
c484caf
·
verified ·
1 Parent(s): 9f460af

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +64 -65
app.py CHANGED
@@ -9,7 +9,7 @@ def standardize_tin(tin):
9
  if pd.isna(tin):
10
  return ""
11
  tin = str(tin).strip()
12
- tin = re.sub(r'\s+', '', tin) # Remove all spaces
13
  if re.match(r'^\d{2}-?\d{6}[A-Z]\d{2}$', tin):
14
  return f"{tin[:2]}-{tin[2:8]} {tin[8]} {tin[9:11]}"
15
  return tin
@@ -21,44 +21,61 @@ def clean_name(name):
21
  return " ".join(str(name).upper().strip().split())
22
 
23
  def normalize_columns(df):
24
- """Replace newline characters and extra spaces in column headers."""
25
- df.columns = [col.replace("\n", " ").strip() for col in df.columns]
 
 
 
 
 
 
 
 
 
 
26
  return df
27
 
28
  def process_employee_data(df):
29
- """Process employee personal information."""
30
  df = normalize_columns(df)
31
 
32
- # Create Employee Name if possible
33
  if 'First Name' in df.columns and 'Last Name' in df.columns:
34
  df['Employee Name'] = df.apply(
35
  lambda x: f"{clean_name(x['First Name'])} {clean_name(x['Last Name'])}",
36
  axis=1
37
  )
38
 
39
- # Ensure TIN column exists using either of the known names.
40
  if 'TIN' in df.columns:
41
  df['TIN'] = df['TIN'].apply(standardize_tin)
42
  elif 'Personal ID of Employee' in df.columns:
43
  df['TIN'] = df['Personal ID of Employee'].apply(standardize_tin)
 
44
  else:
45
  raise KeyError("Employee data must contain a 'TIN' or 'Personal ID of Employee' column.")
46
 
47
  return df
48
 
49
  def process_salary_data(df):
50
- """Process salary and deductions data."""
51
  df = normalize_columns(df)
52
 
 
53
  if 'TIN' in df.columns:
54
  df['TIN'] = df['TIN'].apply(standardize_tin)
55
  elif 'TIN or Personal ID of Employee' in df.columns:
56
  df['TIN'] = df['TIN or Personal ID of Employee'].apply(standardize_tin)
 
57
  else:
58
  raise KeyError("Salary data must contain a 'TIN' or 'TIN or Personal ID of Employee' column.")
59
 
60
- numeric_columns = df.select_dtypes(include=[np.number]).columns
61
- df[numeric_columns] = df[numeric_columns].fillna(0)
 
 
 
 
62
 
63
  return df
64
 
@@ -70,52 +87,50 @@ def process_paye_data(df):
70
  df['TIN'] = df['TIN'].apply(standardize_tin)
71
  elif 'TIN or Personal ID of Employee' in df.columns:
72
  df['TIN'] = df['TIN or Personal ID of Employee'].apply(standardize_tin)
 
73
  else:
74
  raise KeyError("PAYE data must contain a 'TIN' or 'TIN or Personal ID of Employee' column.")
75
 
76
- numeric_columns = df.select_dtypes(include=[np.number]).columns
77
- df[numeric_columns] = df[numeric_columns].fillna(0)
 
 
 
78
 
79
  return df
80
 
81
  def merge_dataframes(employee_df, salary_df, paye_df):
82
- """Merge employee, salary, and PAYE information."""
83
- # Merge salary (earnings) into employee data (earnings is the master)
84
- merged_df = pd.merge(
85
- employee_df,
86
- salary_df,
87
- on='TIN',
88
- how='outer',
89
- suffixes=('', '_salary')
90
- )
91
-
92
- # Merge PAYE into the merged dataset
93
- merged_df = pd.merge(
94
- merged_df,
95
- paye_df,
96
- on='TIN',
97
- how='outer',
98
- suffixes=('', '_paye')
99
- )
100
-
101
- # Drop duplicate columns (if any)
102
- duplicate_cols = [col for col in merged_df.columns if col.endswith(('_salary', '_paye'))]
103
- merged_df.drop(columns=duplicate_cols, inplace=True)
104
-
105
- # Fill missing numeric values with 0
106
- numeric_columns = merged_df.select_dtypes(include=[np.number]).columns
107
- merged_df[numeric_columns] = merged_df[numeric_columns].fillna(0)
108
 
109
  return merged_df
110
 
111
  def main():
112
  st.title("Payroll Data Processor")
113
-
114
  st.write("""
115
- Upload:
116
- 1. Employee Information File (Template)
117
- 2. Salary (Earnings) Information File
118
- 3. PAYE Information File
119
  """)
120
 
121
  employee_file = st.file_uploader("Upload Employee Information", type=['xlsx', 'xls'])
@@ -124,41 +139,25 @@ def main():
124
 
125
  if employee_file and salary_file and paye_file:
126
  try:
127
- # Process employee data
128
- employee_df = pd.read_excel(employee_file)
129
- employee_df = process_employee_data(employee_df)
130
- st.write("Employee data processed successfully")
 
131
 
132
- # Process salary data
133
- salary_df = pd.read_excel(salary_file)
134
  salary_df = process_salary_data(salary_df)
135
- st.write("Salary data processed successfully")
136
-
137
- # Process PAYE data
138
- paye_df = pd.read_excel(paye_file)
139
  paye_df = process_paye_data(paye_df)
140
- st.write("PAYE data processed successfully")
141
 
142
- # Merge the dataframes
143
  final_df = merge_dataframes(employee_df, salary_df, paye_df)
144
 
145
- # Organize columns in desired order
146
- column_order = [
147
- 'TIN', 'Employee Name', 'First Name', 'Middle Name', 'Last Name',
148
- 'Birth Date', 'Employed From date', 'Employed To date', 'Position'
149
- ]
150
- remaining_cols = [col for col in final_df.columns if col not in column_order]
151
- column_order.extend(remaining_cols)
152
- final_df = final_df[column_order]
153
-
154
  st.subheader("Master Payroll Data Preview")
155
  st.dataframe(final_df)
156
 
157
- # Prepare download
158
  output = BytesIO()
159
  with pd.ExcelWriter(output, engine='xlsxwriter') as writer:
160
  final_df.to_excel(writer, index=False, sheet_name='Master Payroll')
161
-
162
  st.download_button(
163
  label="Download Master Payroll Excel",
164
  data=output.getvalue(),
 
9
  if pd.isna(tin):
10
  return ""
11
  tin = str(tin).strip()
12
+ tin = re.sub(r'\s+', '', tin) # remove all spaces
13
  if re.match(r'^\d{2}-?\d{6}[A-Z]\d{2}$', tin):
14
  return f"{tin[:2]}-{tin[2:8]} {tin[8]} {tin[9:11]}"
15
  return tin
 
21
  return " ".join(str(name).upper().strip().split())
22
 
23
  def normalize_columns(df):
24
+ """Clean up column names: replace newline characters and extra spaces."""
25
+ df.columns = [str(col).replace("\n", " ").strip() for col in df.columns]
26
+ return df
27
+
28
+ def read_excel_file(file, header_option=0):
29
+ """
30
+ Read an Excel file and normalize its column names.
31
+ If your file uses multi-row headers, consider setting header_option=[0,1]
32
+ and then flattening the MultiIndex.
33
+ """
34
+ df = pd.read_excel(file, header=header_option)
35
+ df = normalize_columns(df)
36
  return df
37
 
38
  def process_employee_data(df):
39
+ """Process employee personal information and create a clean TIN."""
40
  df = normalize_columns(df)
41
 
42
+ # Create Employee Name if possible.
43
  if 'First Name' in df.columns and 'Last Name' in df.columns:
44
  df['Employee Name'] = df.apply(
45
  lambda x: f"{clean_name(x['First Name'])} {clean_name(x['Last Name'])}",
46
  axis=1
47
  )
48
 
49
+ # Use either the "TIN" or "Personal ID of Employee" column.
50
  if 'TIN' in df.columns:
51
  df['TIN'] = df['TIN'].apply(standardize_tin)
52
  elif 'Personal ID of Employee' in df.columns:
53
  df['TIN'] = df['Personal ID of Employee'].apply(standardize_tin)
54
+ df.drop(columns=['Personal ID of Employee'], inplace=True)
55
  else:
56
  raise KeyError("Employee data must contain a 'TIN' or 'Personal ID of Employee' column.")
57
 
58
  return df
59
 
60
  def process_salary_data(df):
61
+ """Process salary (earnings) data."""
62
  df = normalize_columns(df)
63
 
64
+ # Get the TIN column from one of the expected names.
65
  if 'TIN' in df.columns:
66
  df['TIN'] = df['TIN'].apply(standardize_tin)
67
  elif 'TIN or Personal ID of Employee' in df.columns:
68
  df['TIN'] = df['TIN or Personal ID of Employee'].apply(standardize_tin)
69
+ df.drop(columns=['TIN or Personal ID of Employee'], inplace=True)
70
  else:
71
  raise KeyError("Salary data must contain a 'TIN' or 'TIN or Personal ID of Employee' column.")
72
 
73
+ # Convert columns (other than known text columns) to numeric.
74
+ ignore_cols = {'TIN', 'First Name', 'Middle Name', 'Last Name', 'Employee Name',
75
+ 'Birth Date', 'Employed From date', 'Employed To date', 'Position'}
76
+ for col in df.columns:
77
+ if col not in ignore_cols:
78
+ df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)
79
 
80
  return df
81
 
 
87
  df['TIN'] = df['TIN'].apply(standardize_tin)
88
  elif 'TIN or Personal ID of Employee' in df.columns:
89
  df['TIN'] = df['TIN or Personal ID of Employee'].apply(standardize_tin)
90
+ df.drop(columns=['TIN or Personal ID of Employee'], inplace=True)
91
  else:
92
  raise KeyError("PAYE data must contain a 'TIN' or 'TIN or Personal ID of Employee' column.")
93
 
94
+ # Convert columns (other than known text/date columns) to numeric.
95
+ ignore_cols = {'TIN', 'Employed From date', 'Employed To date'}
96
+ for col in df.columns:
97
+ if col not in ignore_cols:
98
+ df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)
99
 
100
  return df
101
 
102
  def merge_dataframes(employee_df, salary_df, paye_df):
103
+ """
104
+ Merge employee, salary, and PAYE data.
105
+ For overlapping columns (from salary and PAYE) we combine values so that nonzero
106
+ values are retained.
107
+ """
108
+ # Merge employee and salary data. (The earnings data is the master.)
109
+ merged_df = pd.merge(employee_df, salary_df, on='TIN', how='outer', suffixes=('', '_salary'))
110
+
111
+ # Merge PAYE data.
112
+ merged_df = pd.merge(merged_df, paye_df, on='TIN', how='outer', suffixes=('', '_paye'))
113
+
114
+ # Combine columns that were duplicated by the merge.
115
+ # For any column that appears as "Column", "Column_salary", and/or "Column_paye",
116
+ # we use nonzero (or non-null) values where available.
117
+ all_columns = list(merged_df.columns)
118
+ for col in all_columns:
119
+ for suffix in ['_salary', '_paye']:
120
+ dup_col = col + suffix
121
+ if dup_col in merged_df.columns:
122
+ merged_df[col] = merged_df[col].combine_first(merged_df[dup_col])
123
+ merged_df.drop(columns=[dup_col], inplace=True)
 
 
 
 
 
124
 
125
  return merged_df
126
 
127
  def main():
128
  st.title("Payroll Data Processor")
 
129
  st.write("""
130
+ Upload the following files:
131
+ 1. Employee Information File (template)
132
+ 2. Salary (earnings) Information File
133
+ 3. PAYE Information File
134
  """)
135
 
136
  employee_file = st.file_uploader("Upload Employee Information", type=['xlsx', 'xls'])
 
139
 
140
  if employee_file and salary_file and paye_file:
141
  try:
142
+ # If your earnings/PAYE files have extra header rows (e.g. a row with currency codes),
143
+ # adjust header_option (e.g., header=[0,1]) and then flatten the columns.
144
+ employee_df = read_excel_file(employee_file, header_option=0)
145
+ salary_df = read_excel_file(salary_file, header_option=0)
146
+ paye_df = read_excel_file(paye_file, header_option=0)
147
 
148
+ employee_df = process_employee_data(employee_df)
 
149
  salary_df = process_salary_data(salary_df)
 
 
 
 
150
  paye_df = process_paye_data(paye_df)
 
151
 
 
152
  final_df = merge_dataframes(employee_df, salary_df, paye_df)
153
 
 
 
 
 
 
 
 
 
 
154
  st.subheader("Master Payroll Data Preview")
155
  st.dataframe(final_df)
156
 
157
+ # Prepare the Excel file for download.
158
  output = BytesIO()
159
  with pd.ExcelWriter(output, engine='xlsxwriter') as writer:
160
  final_df.to_excel(writer, index=False, sheet_name='Master Payroll')
 
161
  st.download_button(
162
  label="Download Master Payroll Excel",
163
  data=output.getvalue(),