Spaces:

rairo
/

OneExcelZimra

Sleeping

App Files Files Community

rairo committed on Feb 13, 2025

Commit

1fc5859

verified ·

1 Parent(s): c484caf

Update app.py

Browse files

Files changed (1) hide show

app.py +83 -56

app.py CHANGED Viewed

@@ -27,52 +27,66 @@ def normalize_columns(df):
 def read_excel_file(file, header_option=0):
     """
-    Read an Excel file and normalize its column names.
-    If your file uses multi-row headers, consider setting header_option=[0,1]
-    and then flattening the MultiIndex.
     """
     df = pd.read_excel(file, header=header_option)
     df = normalize_columns(df)
     return df
 def process_employee_data(df):
-    """Process employee personal information and create a clean TIN."""
     df = normalize_columns(df)
-    # Create Employee Name if possible.
-    if 'First Name' in df.columns and 'Last Name' in df.columns:
-        df['Employee Name'] = df.apply(
-            lambda x: f"{clean_name(x['First Name'])} {clean_name(x['Last Name'])}",
-            axis=1
-        )
-    # Use either the "TIN" or "Personal ID of Employee" column.
     if 'TIN' in df.columns:
         df['TIN'] = df['TIN'].apply(standardize_tin)
-    elif 'Personal ID of Employee' in df.columns:
-        df['TIN'] = df['Personal ID of Employee'].apply(standardize_tin)
-        df.drop(columns=['Personal ID of Employee'], inplace=True)
     else:
-        raise KeyError("Employee data must contain a 'TIN' or 'Personal ID of Employee' column.")
     return df
 def process_salary_data(df):
-    """Process salary (earnings) data."""
     df = normalize_columns(df)
-    # Get the TIN column from one of the expected names.
     if 'TIN' in df.columns:
         df['TIN'] = df['TIN'].apply(standardize_tin)
-    elif 'TIN or Personal ID of Employee' in df.columns:
-        df['TIN'] = df['TIN or Personal ID of Employee'].apply(standardize_tin)
-        df.drop(columns=['TIN or Personal ID of Employee'], inplace=True)
     else:
-        raise KeyError("Salary data must contain a 'TIN' or 'TIN or Personal ID of Employee' column.")
-    # Convert columns (other than known text columns) to numeric.
-    ignore_cols = {'TIN', 'First Name', 'Middle Name', 'Last Name', 'Employee Name',
-                   'Birth Date', 'Employed From date', 'Employed To date', 'Position'}
     for col in df.columns:
         if col not in ignore_cols:
             df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)
@@ -80,18 +94,20 @@ def process_salary_data(df):
     return df
 def process_paye_data(df):
-    """Process PAYE data."""
     df = normalize_columns(df)
     if 'TIN' in df.columns:
         df['TIN'] = df['TIN'].apply(standardize_tin)
-    elif 'TIN or Personal ID of Employee' in df.columns:
-        df['TIN'] = df['TIN or Personal ID of Employee'].apply(standardize_tin)
-        df.drop(columns=['TIN or Personal ID of Employee'], inplace=True)
     else:
-        raise KeyError("PAYE data must contain a 'TIN' or 'TIN or Personal ID of Employee' column.")
-    # Convert columns (other than known text/date columns) to numeric.
     ignore_cols = {'TIN', 'Employed From date', 'Employed To date'}
     for col in df.columns:
         if col not in ignore_cols:
@@ -101,26 +117,39 @@ def process_paye_data(df):
 def merge_dataframes(employee_df, salary_df, paye_df):
     """
-    Merge employee, salary, and PAYE data.
-    For overlapping columns (from salary and PAYE) we combine values so that nonzero
-    values are retained.
     """
-    # Merge employee and salary data. (The earnings data is the master.)
-    merged_df = pd.merge(employee_df, salary_df, on='TIN', how='outer', suffixes=('', '_salary'))
-    # Merge PAYE data.
-    merged_df = pd.merge(merged_df, paye_df, on='TIN', how='outer', suffixes=('', '_paye'))
-    # Combine columns that were duplicated by the merge.
-    # For any column that appears as "Column", "Column_salary", and/or "Column_paye",
-    # we use nonzero (or non-null) values where available.
-    all_columns = list(merged_df.columns)
-    for col in all_columns:
-        for suffix in ['_salary', '_paye']:
-            dup_col = col + suffix
-            if dup_col in merged_df.columns:
-                merged_df[col] = merged_df[col].combine_first(merged_df[dup_col])
-                merged_df.drop(columns=[dup_col], inplace=True)
     return merged_df
@@ -129,32 +158,30 @@ def main():
     st.write("""
     Upload the following files:
       1. Employee Information File (template)
-      2. Salary (earnings) Information File
       3. PAYE Information File
     """)
     employee_file = st.file_uploader("Upload Employee Information", type=['xlsx', 'xls'])
-    salary_file = st.file_uploader("Upload Salary Information", type=['xlsx', 'xls'])
     paye_file = st.file_uploader("Upload PAYE Information", type=['xlsx', 'xls'])
     if employee_file and salary_file and paye_file:
         try:
-            # If your earnings/PAYE files have extra header rows (e.g. a row with currency codes),
-            # adjust header_option (e.g., header=[0,1]) and then flatten the columns.
             employee_df = read_excel_file(employee_file, header_option=0)
-            salary_df = read_excel_file(salary_file, header_option=0)
-            paye_df = read_excel_file(paye_file, header_option=0)
             employee_df = process_employee_data(employee_df)
-            salary_df = process_salary_data(salary_df)
-            paye_df = process_paye_data(paye_df)
             final_df = merge_dataframes(employee_df, salary_df, paye_df)
             st.subheader("Master Payroll Data Preview")
             st.dataframe(final_df)
-            # Prepare the Excel file for download.
             output = BytesIO()
             with pd.ExcelWriter(output, engine='xlsxwriter') as writer:
                 final_df.to_excel(writer, index=False, sheet_name='Master Payroll')

 def read_excel_file(file, header_option=0):
     """
     Load an Excel workbook into a DataFrame and tidy it up.

     The sheet is read with ``header_option`` forwarded to ``pd.read_excel``
     as ``header``, its columns are passed through ``normalize_columns``, and
     then rows, followed by columns, that contain no values at all are
     discarded.
     """
     raw = pd.read_excel(file, header=header_option)
     normalized = normalize_columns(raw)
     # Rows first, then columns -- same order as the two separate drops.
     without_blank_rows = normalized.dropna(axis=0, how='all')
     return without_blank_rows.dropna(axis=1, how='all')
def get_column(df, possible_names):
    """
    Return the actual column label in ``df`` that matches one of ``possible_names``.

    Matching is case-insensitive. Candidates are tried in the order given, so
    earlier entries in ``possible_names`` take priority. Labels and candidates
    are compared by their string form, so non-string labels (for example
    numeric headers) do not break the lookup.

    Args:
        df: DataFrame whose ``columns`` are searched.
        possible_names: Iterable of acceptable header names, most preferred first.

    Returns:
        The label exactly as it appears in ``df.columns``, or ``None`` when no
        candidate matches.
    """
    # Map lower-cased label text -> real label. If two labels differ only by
    # case, the later one wins (plain dict-comprehension semantics).
    lower_cols = {str(col).lower(): col for col in df.columns}
    for name in possible_names:
        key = str(name).lower()
        if key in lower_cols:
            return lower_cols[key]
    return None
 def process_employee_data(df):
     """
     Process employee personal information; create clean TIN and Employee Name.

     Steps:
       1. Pass the columns through ``normalize_columns``.
       2. When there is no usable 'Employee Name' column (absent, or every
          value missing), build one from a first-name and a last-name column,
          each value passed through ``clean_name`` and joined by one space.
       3. Ensure a 'TIN' column exists and pass every value through
          ``standardize_tin``.

     An exact 'TIN' header is used as-is. Otherwise the header is looked up
     case-insensitively ('TIN' first, then 'Personal ID of Employee'); the
     matched column is copied into a new 'TIN' column and then dropped.

     Args:
         df: Employee DataFrame as read from the uploaded workbook.

     Returns:
         The processed DataFrame.

     Raises:
         KeyError: If neither a 'TIN' nor a 'Personal ID of Employee' column
             is present.
     """
     df = normalize_columns(df)

     # Create Employee Name if not present by combining first and last name.
     if 'Employee Name' not in df.columns or df['Employee Name'].isna().all():
         first_name_col = get_column(df, ["First Name", "First", "Forename"])
         last_name_col = get_column(df, ["Last Name", "Surname", "Family Name", "Last"])
         if first_name_col is not None and last_name_col is not None:
             df["Employee Name"] = (
                 df[first_name_col].apply(clean_name)
                 + " "
                 + df[last_name_col].apply(clean_name)
             )

     # Standardize TIN using one of the expected headers.
     if 'TIN' in df.columns:
         df['TIN'] = df['TIN'].apply(standardize_tin)
         return df

     # Case-insensitive fallback, so 'Tin' / 'tin' are accepted the same way
     # as the alternative header.
     alt = get_column(df, ["TIN", "Personal ID of Employee"])
     if alt is None:
         raise KeyError("Employee data must contain a 'TIN' or 'Personal ID of Employee' column.")
     df['TIN'] = df[alt].apply(standardize_tin)
     df.drop(columns=[alt], inplace=True)
     return df
 def process_salary_data(df):
     """
     Process salary (earnings) data; convert value columns to numeric.

     Columns are passed through ``normalize_columns`` and a 'TIN' column is
     ensured, with each value passed through ``standardize_tin``. An exact
     'TIN' header is used as-is. Otherwise the header is looked up
     case-insensitively ('TIN' first, then 'TIN or Personal ID of Employee');
     the matched column is copied into a new 'TIN' column and then dropped.

     Every column not listed in ``ignore_cols`` is coerced with
     ``pd.to_numeric(errors='coerce')`` and missing/unparseable values become 0.
     The ignore list names the identifying text and date columns so that, if
     the earnings sheet carries them, they are not overwritten with zeros.

     Args:
         df: Salary DataFrame as read from the uploaded workbook.

     Returns:
         The processed DataFrame.

     Raises:
         KeyError: If no 'TIN' or 'TIN or Personal ID of Employee' column is
             present.
     """
     df = normalize_columns(df)

     if 'TIN' in df.columns:
         df['TIN'] = df['TIN'].apply(standardize_tin)
     else:
         alt = get_column(df, ["TIN", "TIN or Personal ID of Employee"])
         if alt is None:
             raise KeyError("Salary data must contain a 'TIN' or 'TIN or Personal ID of Employee' column.")
         df['TIN'] = df[alt].apply(standardize_tin)
         df.drop(columns=[alt], inplace=True)

     # Columns that must stay as text/dates; everything else is treated as an
     # amount. Names that are absent from the sheet are simply never hit.
     ignore_cols = {
         'TIN', 'First Name', 'Middle Name', 'Last Name', 'Employee Name',
         'Birth Date', 'Employed From date', 'Employed To date', 'Position',
         'Currency',
     }
     for col in df.columns:
         if col not in ignore_cols:
             df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)
     return df
 def process_paye_data(df):
+    """Process PAYE data; convert non-key columns to numeric."""
     df = normalize_columns(df)
     if 'TIN' in df.columns:
         df['TIN'] = df['TIN'].apply(standardize_tin)
     else:
+        alt = get_column(df, ["TIN or Personal ID of Employee"])
+        if alt:
+            df['TIN'] = df[alt].apply(standardize_tin)
+            df.drop(columns=[alt], inplace=True)
+        else:
+            raise KeyError("PAYE data must contain a 'TIN' or 'TIN or Personal ID of Employee' column.")
+    # Convert non-key columns to numeric.
     ignore_cols = {'TIN', 'Employed From date', 'Employed To date'}
     for col in df.columns:
         if col not in ignore_cols:
 def merge_dataframes(employee_df, salary_df, paye_df):
     """
     Merge the three datasets using the salary (earnings) file as the master.

     Employee and then PAYE data are left-joined onto the salary rows on
     'TIN', so only TINs present in ``salary_df`` appear in the result. For
     every column name shared by both sides of a join (other than 'TIN'), the
     value already in the merged frame is kept and missing values are filled
     from the incoming frame via ``combine_first``.

     Shared columns are determined from the frames before each join rather
     than by scanning for '_emp' / '_paye' endings afterwards. A source column
     whose real name ends in one of those suffixes is therefore left
     untouched, instead of being renamed and then dropped under its old name
     (which raised ``KeyError``).

     NOTE: as with any left join, duplicate TINs in ``employee_df`` or
     ``paye_df`` produce duplicate rows in the result.

     Args:
         employee_df: Processed employee data containing a 'TIN' column.
         salary_df: Processed salary data containing a 'TIN' column (master).
         paye_df: Processed PAYE data containing a 'TIN' column.

     Returns:
         A new DataFrame; NaN values remaining in numeric columns are
         replaced by 0.
     """
     # Use salary_df as master.
     merged_df = salary_df.copy()

     for other_df, suffix in ((employee_df, '_emp'), (paye_df, '_paye')):
         # Names present on both sides will come back from pandas as
         # "<name>" (left) and "<name><suffix>" (right).
         shared = [
             col for col in other_df.columns
             if col != 'TIN' and col in merged_df.columns
         ]
         merged_df = merged_df.merge(other_df, on='TIN', how='left', suffixes=('', suffix))
         for name in shared:
             duplicate = f"{name}{suffix}"
             merged_df[name] = merged_df[name].combine_first(merged_df[duplicate])
             merged_df = merged_df.drop(columns=[duplicate])

     # Fill any remaining NaN in numeric columns with 0.
     numeric_columns = merged_df.select_dtypes(include=[np.number]).columns
     merged_df[numeric_columns] = merged_df[numeric_columns].fillna(0)
     return merged_df
     st.write("""
     Upload the following files:
       1. Employee Information File (template)
+      2. Salary (earnings) Information File – this file is the master
       3. PAYE Information File
     """)
     employee_file = st.file_uploader("Upload Employee Information", type=['xlsx', 'xls'])
+    salary_file = st.file_uploader("Upload Salary (Earnings) Information", type=['xlsx', 'xls'])
     paye_file = st.file_uploader("Upload PAYE Information", type=['xlsx', 'xls'])
     if employee_file and salary_file and paye_file:
         try:
             employee_df = read_excel_file(employee_file, header_option=0)
+            salary_df   = read_excel_file(salary_file, header_option=0)
+            paye_df     = read_excel_file(paye_file, header_option=0)
             employee_df = process_employee_data(employee_df)
+            salary_df   = process_salary_data(salary_df)
+            paye_df     = process_paye_data(paye_df)
             final_df = merge_dataframes(employee_df, salary_df, paye_df)
             st.subheader("Master Payroll Data Preview")
             st.dataframe(final_df)
+            # Prepare Excel file for download.
             output = BytesIO()
             with pd.ExcelWriter(output, engine='xlsxwriter') as writer:
                 final_df.to_excel(writer, index=False, sheet_name='Master Payroll')