rairo committed on
Commit
e19ef0e
·
verified ·
1 Parent(s): 412766b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +103 -113
app.py CHANGED
@@ -2,153 +2,140 @@ import streamlit as st
2
  import pandas as pd
3
  from io import BytesIO
4
  import numpy as np
 
5
 
6
def normalize_column_name(col: str) -> str:
    """Return a column label with whitespace normalized.

    Leading/trailing whitespace is removed and every internal run of
    whitespace (including newlines from wrapped spreadsheet headers) is
    collapsed to a single space.

    Args:
        col: The raw column label. Non-string labels (e.g. numeric headers)
            are converted with ``str()`` and returned without further changes.

    Returns:
        The normalized label as a string.
    """
    if not isinstance(col, str):
        return str(col)
    # str.split() with no argument splits on any whitespace run and ignores
    # leading/trailing whitespace, so no separate strip()/replace() is needed.
    return " ".join(col.split())
11
-
12
def split_currency_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Rename currency columns so the currency code is a trailing suffix.

    Any string column whose name contains ``USD``, ``ZWL`` or ``ZWG``
    (case-insensitive) is renamed to ``"<base name> USD"`` or
    ``"<base name> ZWL"``. ``ZWG`` columns are labelled ``ZWL``, and ``USD``
    wins when a name mentions several codes. No column is actually split;
    only labels change.

    Args:
        df: Frame whose columns are renamed in place.

    Returns:
        The same ``df`` object, for call chaining.
    """
    import re  # local import: the block must not rely on a file-level import

    # Detection is case-insensitive, so removal must be too; otherwise
    # "Pay usd" would become "Pay usd USD".
    currency_token = re.compile(r"USD|ZWL|ZWG", re.IGNORECASE)

    renames = {}
    for col in df.columns:
        if not isinstance(col, str):
            continue
        upper = col.upper()
        if "USD" in upper:
            suffix = "USD"
        elif "ZWL" in upper or "ZWG" in upper:
            suffix = "ZWL"
        else:
            continue
        # Collapse the whitespace left behind where the token was removed.
        base_name = " ".join(currency_token.sub("", col).split())
        renames[col] = f"{base_name} {suffix}".strip()

    if renames:
        df.rename(columns=renames, inplace=True)
    return df
27
-
28
def clean_tin(tin: str) -> str:
    """Return a TIN as a trimmed string with single internal spaces.

    Args:
        tin: Raw cell value. Missing values (``None``/NaN) become ``""``.
            Integer-valued floats are rendered without the trailing ``.0``
            so that a numeric spreadsheet cell matches its text equivalent.

    Returns:
        The cleaned TIN string.
    """
    if pd.isna(tin):
        return ""
    # A numeric ID column containing blanks is read as float, which would
    # otherwise stringify as e.g. "12345678.0".
    if isinstance(tin, float) and tin.is_integer():
        tin = int(tin)
    return " ".join(str(tin).split())
 
 
 
 
35
 
36
def clean_name(name: str) -> str:
    """Return a name upper-cased with whitespace normalized.

    Args:
        name: Raw cell value. Missing values (``None``/NaN) become ``""``.

    Returns:
        The upper-cased name with leading/trailing whitespace removed and
        internal whitespace runs collapsed to single spaces.
    """
    if pd.isna(name):
        return ""
    # split() already discards leading/trailing whitespace.
    return " ".join(str(name).upper().split())
43
 
44
def process_dataframe(df: pd.DataFrame, file_name: str) -> pd.DataFrame:
    """Bring one uploaded sheet into the common layout used for merging.

    Steps: drop fully empty rows, normalize column labels, expose the
    employee identifier as ``TIN``, build ``Employee Name`` from first/last
    name when it is absent, clean both key columns, and normalize currency
    column labels.

    Args:
        df: Frame as read from the uploaded file. It is not modified.
        file_name: Name shown in the Streamlit progress message.

    Returns:
        A new, processed frame.
    """
    st.write(f"Processing file: **{file_name}**")

    # Drop blank rows first: cleaning the key columns below turns NaN into
    # "", after which a blank row would no longer count as all-NaN.
    df = df.dropna(how="all").copy()

    df.columns = [normalize_column_name(col) for col in df.columns]

    # Only alias an ID column to TIN when no TIN column exists yet; renaming
    # onto an existing label would create duplicate "TIN" columns.
    if "TIN" not in df.columns:
        for alias in ("TIN or Personal ID of Employee", "Personal ID of Employee"):
            if alias in df.columns:
                df = df.rename(columns={alias: "TIN"})
                break

    has_name_parts = "First Name" in df.columns and "Last Name" in df.columns
    if "Employee Name" not in df.columns and has_name_parts:
        df["Employee Name"] = df["First Name"].fillna("") + " " + df["Last Name"].fillna("")

    if "TIN" in df.columns:
        df["TIN"] = df["TIN"].apply(clean_tin)
    if "Employee Name" in df.columns:
        df["Employee Name"] = df["Employee Name"].apply(clean_name)

    return split_currency_columns(df)
74
 
75
def merge_dataframes(dfs: list) -> pd.DataFrame:
    """Outer-merge all frames on ``TIN`` and ``Employee Name``.

    When a non-key column appears in more than one frame, the value from the
    earliest frame is kept. Missing values in the result are replaced by
    empty strings.

    Args:
        dfs: Processed frames, in merge order. The frames are not modified.

    Returns:
        The merged frame, or an empty frame when ``dfs`` is empty.
    """
    if not dfs:
        return pd.DataFrame()

    key_cols = ["TIN", "Employee Name"]

    def _with_clean_keys(frame: pd.DataFrame) -> pd.DataFrame:
        # Work on a copy so the caller's frame is left untouched.
        frame = frame.copy()
        for col in key_cols:
            if col not in frame.columns:
                frame[col] = ""
        frame["TIN"] = frame["TIN"].apply(clean_tin)
        frame["Employee Name"] = frame["Employee Name"].apply(clean_name)
        return frame

    master_df = dfs[0]
    if len(dfs) > 1:
        # Keys stay clean across merges, so the master needs cleaning once.
        master_df = _with_clean_keys(master_df)
        for df in dfs[1:]:
            master_df = pd.merge(
                master_df,
                _with_clean_keys(df),
                on=key_cols,
                how="outer",
                suffixes=("", "_drop"),
            )
            # Columns already present in the master win; discard the copies.
            drop_cols = [col for col in master_df.columns if col.endswith("_drop")]
            master_df = master_df.drop(columns=drop_cols)

    return master_df.replace({np.nan: "", None: ""})
114
 
115
  def main():
116
- st.title("Enhanced Payroll Data Processor")
 
117
  st.write("""
118
- Upload your payroll data files. The system will:
119
- 1. Standardize employee identification using TIN and Employee Name
120
- 2. Handle both USD and ZWL currency columns
121
- 3. Merge all data into a comprehensive master sheet
122
  """)
123
 
124
- uploaded_files = st.file_uploader(
125
- "Upload payroll data files",
126
- type=["xlsx", "xls"],
127
- accept_multiple_files=True
128
- )
129
 
130
- if uploaded_files:
131
- processed_dfs = []
132
- for file in uploaded_files:
133
- try:
134
- df = pd.read_excel(file)
135
- df = process_dataframe(df, file.name)
136
- processed_dfs.append(df)
137
- st.write(f"Successfully processed {file.name}")
138
- except Exception as e:
139
- st.error(f"Error processing {file.name}: {str(e)}")
140
- return
141
-
142
- if processed_dfs:
143
- master_df = merge_dataframes(processed_dfs)
 
 
 
 
 
 
 
 
 
 
 
 
 
144
 
145
  st.subheader("Master Payroll Data Preview")
146
- st.dataframe(master_df)
147
 
148
  # Prepare download
149
  output = BytesIO()
150
  with pd.ExcelWriter(output, engine='xlsxwriter') as writer:
151
- master_df.to_excel(writer, index=False, sheet_name='Master Payroll')
152
 
153
  st.download_button(
154
  label="Download Master Payroll Excel",
@@ -156,6 +143,9 @@ def main():
156
  file_name="master_payroll.xlsx",
157
  mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
158
  )
 
 
 
159
 
160
  if __name__ == "__main__":
161
  main()
 
2
  import pandas as pd
3
  from io import BytesIO
4
  import numpy as np
5
+ import re
6
 
7
# Two digits, optional hyphen, six digits, one check letter, two digits.
_TIN_PATTERN = re.compile(r"^(\d{2})-?(\d{6})([A-Z])(\d{2})$", re.IGNORECASE)


def standardize_tin(tin):
    """Return a TIN in the canonical ``NN-NNNNNN L NN`` layout.

    All whitespace is removed first. If the remainder matches the pattern
    (with or without the hyphen, check letter in either case) it is
    re-formatted; otherwise the whitespace-free value is returned unchanged.
    The function is idempotent, so already-standardized values are safe to
    pass through again.

    Args:
        tin: Raw cell value. Missing values (``None``/NaN) become ``""``.

    Returns:
        The standardized TIN string.
    """
    if pd.isna(tin):
        return ""
    compact = re.sub(r"\s+", "", str(tin))
    match = _TIN_PATTERN.match(compact)
    if match is None:
        return compact
    # Use the captured groups rather than fixed slices: the hyphen is
    # optional, so character positions differ between accepted inputs.
    prefix, serial, letter, suffix = match.groups()
    return f"{prefix}-{serial} {letter.upper()} {suffix}"
18
 
19
def clean_name(name):
    """Return a name upper-cased with whitespace normalized.

    Args:
        name: Raw cell value. Missing values (``None``/NaN) become ``""``.

    Returns:
        The upper-cased name with leading/trailing whitespace removed and
        internal whitespace runs collapsed to single spaces.
    """
    if pd.isna(name):
        return ""
    # split() already discards leading/trailing whitespace.
    return " ".join(str(name).upper().split())
 
 
24
 
25
def process_employee_data(df):
    """Prepare the employee-information sheet for merging.

    Column labels are stripped of surrounding whitespace, ``Employee Name``
    is built from ``First Name`` and ``Last Name`` when both exist, and a
    standardized ``TIN`` column is derived from the first identifier column
    found.

    Args:
        df: Frame as read from the employee file. It is not modified.

    Returns:
        A new, processed frame.
    """
    df = df.copy()

    # Headers read from a spreadsheet may be numbers or dates; only strings
    # can be stripped.
    df.columns = [col.strip() if isinstance(col, str) else col for col in df.columns]

    if 'First Name' in df.columns and 'Last Name' in df.columns:
        # Built element-wise (not via row-wise DataFrame.apply) so an empty
        # sheet yields an empty column instead of an assignment error.
        df['Employee Name'] = [
            f"{clean_name(first)} {clean_name(last)}".strip()
            for first, last in zip(df['First Name'], df['Last Name'])
        ]

    # Accept the same identifier headers as the salary sheet so both sides
    # end up with a comparable TIN key.
    tin_aliases = ('TIN', 'TIN or Personal ID of Employee', 'Personal ID of Employee')
    tin_col = next((name for name in tin_aliases if name in df.columns), None)
    if tin_col is not None:
        df['TIN'] = df[tin_col].apply(standardize_tin)

    return df
49
+
50
def process_salary_data(df):
    """Prepare the salary/deductions sheet for merging.

    Column labels are stripped of surrounding whitespace, a standardized
    ``TIN`` column is derived from the first identifier column found, and
    missing values in numeric columns are replaced by 0.

    Args:
        df: Frame as read from the salary file. It is not modified.

    Returns:
        A new, processed frame.
    """
    df = df.copy()

    # Headers read from a spreadsheet may be numbers or dates; only strings
    # can be stripped.
    df.columns = [col.strip() if isinstance(col, str) else col for col in df.columns]

    # Accept the same identifier headers as the employee sheet so both sides
    # end up with a comparable TIN key.
    tin_aliases = ('TIN', 'TIN or Personal ID of Employee', 'Personal ID of Employee')
    tin_col = next((name for name in tin_aliases if name in df.columns), None)
    if tin_col is not None:
        df['TIN'] = df[tin_col].apply(standardize_tin)

    # The selected columns are already numeric, so no conversion is needed;
    # only the gaps have to be filled.
    numeric_columns = df.select_dtypes(include=[np.number]).columns
    if len(numeric_columns) > 0:
        df[numeric_columns] = df[numeric_columns].fillna(0)

    return df
66
 
67
def merge_dataframes(employee_df, salary_df):
    """Outer-merge employee and salary data on ``TIN``.

    When a non-key column exists in both frames, the employee frame's values
    are kept. Missing values in numeric columns of the result are replaced
    by 0.

    Args:
        employee_df: Processed employee frame; must contain ``TIN``.
        salary_df: Processed salary frame; must contain ``TIN``.
            Neither frame is modified.

    Returns:
        The merged frame.

    Raises:
        KeyError: If either frame has no ``TIN`` column.
    """
    for label, frame in (("employee", employee_df), ("salary", salary_df)):
        if 'TIN' not in frame.columns:
            raise KeyError(f"'TIN' column missing from {label} data")

    # Copy before touching the key so the caller's frames stay unchanged.
    employee_df = employee_df.copy()
    salary_df = salary_df.copy()
    employee_df['TIN'] = employee_df['TIN'].apply(standardize_tin)
    salary_df['TIN'] = salary_df['TIN'].apply(standardize_tin)

    # Drop overlapping salary columns up front instead of merging with a
    # suffix and deleting by name afterwards, which would also delete any
    # genuine column whose name happens to end in that suffix.
    overlap = [
        col for col in salary_df.columns
        if col != 'TIN' and col in employee_df.columns
    ]
    salary_df = salary_df.drop(columns=overlap)

    merged_df = pd.merge(employee_df, salary_df, on='TIN', how='outer')

    numeric_columns = merged_df.select_dtypes(include=[np.number]).columns
    if len(numeric_columns) > 0:
        merged_df[numeric_columns] = merged_df[numeric_columns].fillna(0)

    return merged_df
91
 
92
  def main():
93
+ st.title("Payroll Data Processor")
94
+
95
  st.write("""
96
+ Upload:
97
+ 1. Employee Information File (with personal details)
98
+ 2. Salary Information File (with financial data)
 
99
  """)
100
 
101
+ employee_file = st.file_uploader("Upload Employee Information", type=['xlsx', 'xls'])
102
+ salary_file = st.file_uploader("Upload Salary Information", type=['xlsx', 'xls'])
 
 
 
103
 
104
+ if employee_file and salary_file:
105
+ try:
106
+ # Process employee data
107
+ employee_df = pd.read_excel(employee_file)
108
+ employee_df = process_employee_data(employee_df)
109
+ st.write("Employee data processed successfully")
110
+
111
+ # Process salary data
112
+ salary_df = pd.read_excel(salary_file)
113
+ salary_df = process_salary_data(salary_df)
114
+ st.write("Salary data processed successfully")
115
+
116
+ # Merge the dataframes
117
+ final_df = merge_dataframes(employee_df, salary_df)
118
+
119
+ # Organize columns in desired order
120
+ column_order = [
121
+ 'TIN', 'Employee Name', 'First Name', 'Middle Name', 'Last Name',
122
+ 'Birth Date', 'Employed From date', 'Employed To date', 'Position'
123
+ ]
124
+
125
+ # Add remaining columns in their original order
126
+ remaining_cols = [col for col in final_df.columns if col not in column_order]
127
+ column_order.extend(remaining_cols)
128
+
129
+ # Reorder columns
130
+ final_df = final_df[column_order]
131
 
132
  st.subheader("Master Payroll Data Preview")
133
+ st.dataframe(final_df)
134
 
135
  # Prepare download
136
  output = BytesIO()
137
  with pd.ExcelWriter(output, engine='xlsxwriter') as writer:
138
+ final_df.to_excel(writer, index=False, sheet_name='Master Payroll')
139
 
140
  st.download_button(
141
  label="Download Master Payroll Excel",
 
143
  file_name="master_payroll.xlsx",
144
  mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
145
  )
146
+
147
+ except Exception as e:
148
+ st.error(f"Error processing files: {str(e)}")
149
 
150
  if __name__ == "__main__":
151
  main()