Spaces:

rairo
/

OneExcelZimra

Sleeping

App Files Community

rairo committed on Feb 11, 2025

Commit

c484caf

verified ·

1 Parent(s): 9f460af

Update app.py

Browse files

Files changed (1) hide show

app.py +64 -65

app.py CHANGED Viewed

@@ -9,7 +9,7 @@ def standardize_tin(tin):
     if pd.isna(tin):
         return ""
     tin = str(tin).strip()
-    tin = re.sub(r'\s+', '', tin)  # Remove all spaces
     if re.match(r'^\d{2}-?\d{6}[A-Z]\d{2}$', tin):
         return f"{tin[:2]}-{tin[2:8]} {tin[8]} {tin[9:11]}"
     return tin
@@ -21,44 +21,61 @@ def clean_name(name):
     return " ".join(str(name).upper().strip().split())
 def normalize_columns(df):
-    """Replace newline characters and extra spaces in column headers."""
-    df.columns = [col.replace("\n", " ").strip() for col in df.columns]
     return df
 def process_employee_data(df):
-    """Process employee personal information."""
     df = normalize_columns(df)
-    # Create Employee Name if possible
     if 'First Name' in df.columns and 'Last Name' in df.columns:
         df['Employee Name'] = df.apply(
             lambda x: f"{clean_name(x['First Name'])} {clean_name(x['Last Name'])}",
             axis=1
         )
-    # Ensure TIN column exists using either of the known names.
     if 'TIN' in df.columns:
         df['TIN'] = df['TIN'].apply(standardize_tin)
     elif 'Personal ID of Employee' in df.columns:
         df['TIN'] = df['Personal ID of Employee'].apply(standardize_tin)
     else:
         raise KeyError("Employee data must contain a 'TIN' or 'Personal ID of Employee' column.")
     return df
 def process_salary_data(df):
-    """Process salary and deductions data."""
     df = normalize_columns(df)
     if 'TIN' in df.columns:
         df['TIN'] = df['TIN'].apply(standardize_tin)
     elif 'TIN or Personal ID of Employee' in df.columns:
         df['TIN'] = df['TIN or Personal ID of Employee'].apply(standardize_tin)
     else:
         raise KeyError("Salary data must contain a 'TIN' or 'TIN or Personal ID of Employee' column.")
-    numeric_columns = df.select_dtypes(include=[np.number]).columns
-    df[numeric_columns] = df[numeric_columns].fillna(0)
     return df
@@ -70,52 +87,50 @@ def process_paye_data(df):
         df['TIN'] = df['TIN'].apply(standardize_tin)
     elif 'TIN or Personal ID of Employee' in df.columns:
         df['TIN'] = df['TIN or Personal ID of Employee'].apply(standardize_tin)
     else:
         raise KeyError("PAYE data must contain a 'TIN' or 'TIN or Personal ID of Employee' column.")
-    numeric_columns = df.select_dtypes(include=[np.number]).columns
-    df[numeric_columns] = df[numeric_columns].fillna(0)
     return df
 def merge_dataframes(employee_df, salary_df, paye_df):
-    """Merge employee, salary, and PAYE information."""
-    # Merge salary (earnings) into employee data (earnings is the master)
-    merged_df = pd.merge(
-        employee_df,
-        salary_df,
-        on='TIN',
-        how='outer',
-        suffixes=('', '_salary')
-    )
-    # Merge PAYE into the merged dataset
-    merged_df = pd.merge(
-        merged_df,
-        paye_df,
-        on='TIN',
-        how='outer',
-        suffixes=('', '_paye')
-    )
-    # Drop duplicate columns (if any)
-    duplicate_cols = [col for col in merged_df.columns if col.endswith(('_salary', '_paye'))]
-    merged_df.drop(columns=duplicate_cols, inplace=True)
-    # Fill missing numeric values with 0
-    numeric_columns = merged_df.select_dtypes(include=[np.number]).columns
-    merged_df[numeric_columns] = merged_df[numeric_columns].fillna(0)
     return merged_df
 def main():
     st.title("Payroll Data Processor")
     st.write("""
-    Upload:
-    1. Employee Information File (Template)
-    2. Salary (Earnings) Information File
-    3. PAYE Information File
     """)
     employee_file = st.file_uploader("Upload Employee Information", type=['xlsx', 'xls'])
@@ -124,41 +139,25 @@ def main():
     if employee_file and salary_file and paye_file:
         try:
-            # Process employee data
-            employee_df = pd.read_excel(employee_file)
-            employee_df = process_employee_data(employee_df)
-            st.write("Employee data processed successfully")
-            # Process salary data
-            salary_df = pd.read_excel(salary_file)
             salary_df = process_salary_data(salary_df)
-            st.write("Salary data processed successfully")
-            # Process PAYE data
-            paye_df = pd.read_excel(paye_file)
             paye_df = process_paye_data(paye_df)
-            st.write("PAYE data processed successfully")
-            # Merge the dataframes
             final_df = merge_dataframes(employee_df, salary_df, paye_df)
-            # Organize columns in desired order
-            column_order = [
-                'TIN', 'Employee Name', 'First Name', 'Middle Name', 'Last Name',
-                'Birth Date', 'Employed From date', 'Employed To date', 'Position'
-            ]
-            remaining_cols = [col for col in final_df.columns if col not in column_order]
-            column_order.extend(remaining_cols)
-            final_df = final_df[column_order]
             st.subheader("Master Payroll Data Preview")
             st.dataframe(final_df)
-            # Prepare download
             output = BytesIO()
             with pd.ExcelWriter(output, engine='xlsxwriter') as writer:
                 final_df.to_excel(writer, index=False, sheet_name='Master Payroll')
             st.download_button(
                 label="Download Master Payroll Excel",
                 data=output.getvalue(),

     if pd.isna(tin):
         return ""
     tin = str(tin).strip()
+    tin = re.sub(r'\s+', '', tin)  # remove all spaces
     if re.match(r'^\d{2}-?\d{6}[A-Z]\d{2}$', tin):
         return f"{tin[:2]}-{tin[2:8]} {tin[8]} {tin[9:11]}"
     return tin
     return " ".join(str(name).upper().strip().split())
 def normalize_columns(df):
+    """Clean up column names: replace newline characters and extra spaces."""
+    df.columns = [str(col).replace("\n", " ").strip() for col in df.columns]
+    return df
+def read_excel_file(file, header_option=0):
+    """
+    Read an Excel file and normalize its column names.
+    If your file uses multi-row headers, consider setting header_option=[0,1]
+    and then flattening the MultiIndex.
+    """
+    df = pd.read_excel(file, header=header_option)
+    df = normalize_columns(df)
     return df
 def process_employee_data(df):
+    """Process employee personal information and create a clean TIN."""
     df = normalize_columns(df)
+    # Create Employee Name if possible.
     if 'First Name' in df.columns and 'Last Name' in df.columns:
         df['Employee Name'] = df.apply(
             lambda x: f"{clean_name(x['First Name'])} {clean_name(x['Last Name'])}",
             axis=1
         )
+    # Use either the "TIN" or "Personal ID of Employee" column.
     if 'TIN' in df.columns:
         df['TIN'] = df['TIN'].apply(standardize_tin)
     elif 'Personal ID of Employee' in df.columns:
         df['TIN'] = df['Personal ID of Employee'].apply(standardize_tin)
+        df.drop(columns=['Personal ID of Employee'], inplace=True)
     else:
         raise KeyError("Employee data must contain a 'TIN' or 'Personal ID of Employee' column.")
     return df
 def process_salary_data(df):
+    """Process salary (earnings) data."""
     df = normalize_columns(df)
+    # Get the TIN column from one of the expected names.
     if 'TIN' in df.columns:
         df['TIN'] = df['TIN'].apply(standardize_tin)
     elif 'TIN or Personal ID of Employee' in df.columns:
         df['TIN'] = df['TIN or Personal ID of Employee'].apply(standardize_tin)
+        df.drop(columns=['TIN or Personal ID of Employee'], inplace=True)
     else:
         raise KeyError("Salary data must contain a 'TIN' or 'TIN or Personal ID of Employee' column.")
+    # Convert columns (other than known text columns) to numeric.
+    ignore_cols = {'TIN', 'First Name', 'Middle Name', 'Last Name', 'Employee Name',
+                   'Birth Date', 'Employed From date', 'Employed To date', 'Position'}
+    for col in df.columns:
+        if col not in ignore_cols:
+            df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)
     return df
         df['TIN'] = df['TIN'].apply(standardize_tin)
     elif 'TIN or Personal ID of Employee' in df.columns:
         df['TIN'] = df['TIN or Personal ID of Employee'].apply(standardize_tin)
+        df.drop(columns=['TIN or Personal ID of Employee'], inplace=True)
     else:
         raise KeyError("PAYE data must contain a 'TIN' or 'TIN or Personal ID of Employee' column.")
+    # Convert columns (other than known text/date columns) to numeric.
+    ignore_cols = {'TIN', 'Employed From date', 'Employed To date'}
+    for col in df.columns:
+        if col not in ignore_cols:
+            df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)
     return df
 def merge_dataframes(employee_df, salary_df, paye_df):
+    """
+    Merge employee, salary, and PAYE data.
+    For overlapping columns (from salary and PAYE) we combine values so that nonzero
+    values are retained.
+    """
+    # Merge employee and salary data. (The earnings data is the master.)
+    merged_df = pd.merge(employee_df, salary_df, on='TIN', how='outer', suffixes=('', '_salary'))
+    # Merge PAYE data.
+    merged_df = pd.merge(merged_df, paye_df, on='TIN', how='outer', suffixes=('', '_paye'))
+    # Combine columns that were duplicated by the merge.
+    # For any column that appears as "Column", "Column_salary", and/or "Column_paye",
+    # we use nonzero (or non-null) values where available.
+    all_columns = list(merged_df.columns)
+    for col in all_columns:
+        for suffix in ['_salary', '_paye']:
+            dup_col = col + suffix
+            if dup_col in merged_df.columns:
+                merged_df[col] = merged_df[col].combine_first(merged_df[dup_col])
+                merged_df.drop(columns=[dup_col], inplace=True)
     return merged_df
 def main():
     st.title("Payroll Data Processor")
     st.write("""
+    Upload the following files:
+      1. Employee Information File (template)
+      2. Salary (earnings) Information File
+      3. PAYE Information File
     """)
     employee_file = st.file_uploader("Upload Employee Information", type=['xlsx', 'xls'])
     if employee_file and salary_file and paye_file:
         try:
+            # If your earnings/PAYE files have extra header rows (e.g. a row with currency codes),
+            # adjust header_option (e.g., header=[0,1]) and then flatten the columns.
+            employee_df = read_excel_file(employee_file, header_option=0)
+            salary_df = read_excel_file(salary_file, header_option=0)
+            paye_df = read_excel_file(paye_file, header_option=0)
+            employee_df = process_employee_data(employee_df)
             salary_df = process_salary_data(salary_df)
             paye_df = process_paye_data(paye_df)
             final_df = merge_dataframes(employee_df, salary_df, paye_df)
             st.subheader("Master Payroll Data Preview")
             st.dataframe(final_df)
+            # Prepare the Excel file for download.
             output = BytesIO()
             with pd.ExcelWriter(output, engine='xlsxwriter') as writer:
                 final_df.to_excel(writer, index=False, sheet_name='Master Payroll')
             st.download_button(
                 label="Download Master Payroll Excel",
                 data=output.getvalue(),