Spaces:

rairo
/

OneExcelZimraAI

Build error

App Files Community

rairo committed on Feb 24, 2025

Commit

a66eb56

verified ·

1 Parent(s): fc7b3ea

Update app.py

Browse files

Files changed (1) hide show

app.py +32 -23

app.py CHANGED Viewed

@@ -39,33 +39,35 @@ def standardize_dataframe(df: pd.DataFrame) -> pd.DataFrame:
     """
     Standardize DataFrame column names and data types:
     - Drops any middle name columns.
-    - Cleans all column names (e.g., "Employee Name" -> "employee_name").
-    - Renames synonyms to common names (e.g., 'tin', 'salary').
     - Creates an 'employee_name' column if missing but first_name and last_name exist.
     - Combines duplicate key columns into one.
-    - Forces the key columns 'tin' and 'employee_name' to be strings.
     """
     # Drop any column that appears to be a middle name
     middle_name_cols = [col for col in df.columns if 'middle_name' in col.lower()]
     if middle_name_cols:
         df = df.drop(columns=middle_name_cols)
-    # Clean all column names first so that "Employee Name" becomes "employee_name"
     df.columns = [clean_column_name(col) for col in df.columns]
-    # Rename columns based on synonyms for TIN and salary
     rename_map = {}
     for col in df.columns:
         if col in ['personal id', 'personal_id', 'tax id', 'taxid'] or "personal_id_of_employee" in col:
             rename_map[col] = 'tin'
-        elif 'tin' in col:
             rename_map[col] = 'tin'
         if any(keyword in col for keyword in ['salary', 'wage', 'earning', 'commission', 'fee', 'payment', 'compensation']):
             rename_map[col] = 'salary'
     if rename_map:
         df = df.rename(columns=rename_map)
-    # Combine duplicate columns (e.g., multiple salary or tin columns)
     if 'salary' in df.columns and list(df.columns).count('salary') > 1:
         salary_cols = [col for col in df.columns if col == 'salary']
         df['salary'] = df[salary_cols].bfill(axis=1).iloc[:, 0]
@@ -75,11 +77,11 @@ def standardize_dataframe(df: pd.DataFrame) -> pd.DataFrame:
         df['tin'] = df[tin_cols].bfill(axis=1).iloc[:, 0]
         df = df.loc[:, ~df.columns.duplicated()]
-    # If employee_name is missing and first_name and last_name exist, create it.
     if 'employee_name' not in df.columns and 'first_name' in df.columns and 'last_name' in df.columns:
         df['employee_name'] = df['first_name'].astype(str).str.strip() + ' ' + df['last_name'].astype(str).str.strip()
-    # Ensure key columns are of the correct type
     if 'salary' in df.columns:
         df['salary'] = pd.to_numeric(df['salary'], errors='coerce')
     if 'tin' in df.columns:
@@ -179,15 +181,16 @@ def read_excel_file(file) -> pd.DataFrame:
 def merge_with_master(processed_files):
     """
     Merge multiple DataFrames using a two-step process:
-    1. Use the earnings file as the master. Drop its inaccurate 'tin' column.
-    2. Merge template info onto earnings via 'employee_name' (the key provided by "Employee Name").
-    3. Then merge the resulting DataFrame with the PAYE file using the (correct) 'tin' key.
     """
     earnings_file = None
     paye_file = None
     template_file = None
-    # Identify files based on filename keywords
     for file_info in processed_files:
         lower_filename = file_info["filename"].lower()
         if "earnings" in lower_filename:
@@ -196,16 +199,17 @@ def merge_with_master(processed_files):
             paye_file = file_info
         elif "template" in lower_filename:
             template_file = file_info
     if not earnings_file:
         st.warning("No earnings file found as master. Using the first file as master.")
         earnings_file = processed_files[0]
-    # Use the earnings DataFrame as the master
     earnings_df = earnings_file["df"]
-    # Drop the inaccurate 'tin' column from earnings, if present
     if 'tin' in earnings_df.columns:
         earnings_df = earnings_df.drop(columns=['tin'])
-    # Double-check removal of any middle_name column (should already be done in standardization)
     if 'middle_name' in earnings_df.columns:
         earnings_df = earnings_df.drop(columns=['middle_name'])
@@ -215,10 +219,10 @@ def merge_with_master(processed_files):
     if template_file is not None:
         st.write(f"Merging template info from '{template_file['filename']}' using key 'employee_name'.")
         template_df = template_file["df"]
-        # Drop any middle_name column from the template file
         if 'middle_name' in template_df.columns:
             template_df = template_df.drop(columns=['middle_name'])
-        # Ensure template has an 'employee_name' column (constructed if necessary)
         if 'employee_name' not in template_df.columns and 'first_name' in template_df.columns and 'last_name' in template_df.columns:
             template_df['employee_name'] = template_df['first_name'].astype(str).str.strip() + ' ' + template_df['last_name'].astype(str).str.strip()
         if 'employee_name' in merged_df.columns and 'employee_name' in template_df.columns:
@@ -226,16 +230,21 @@ def merge_with_master(processed_files):
         else:
             st.warning("Column 'employee_name' missing in either the earnings or template file. Skipping template merge.")
     else:
-        st.warning("No template file detected.")
     # Merge PAYE figures onto the merged DataFrame using 'tin'
     if paye_file is not None:
         st.write(f"Merging PAYE figures from '{paye_file['filename']}' using key 'tin'.")
         paye_df = paye_file["df"]
-        if 'tin' in merged_df.columns and 'tin' in paye_df.columns:
             merged_df = merged_df.merge(paye_df, on='tin', how='left', suffixes=('', '_paye'))
         else:
-            st.warning("Column 'tin' missing in either the merged or PAYE file. Skipping PAYE merge.")
     else:
         st.warning("No PAYE file detected.")
@@ -268,7 +277,7 @@ def main():
                     if df.empty:
                         st.warning(f"DataFrame from '{uploaded_file.name}' is empty. Please check the file.")
                         continue
-                    # Standardize column names and key columns
                     df = standardize_dataframe(df)
                     st.write("Initial Preview:")
                     st.dataframe(df.head())
@@ -277,7 +286,7 @@ def main():
                     if analysis:
                         st.write("Column Analysis:")
                         st.json(analysis)
-                        # Apply suggested renames if provided by the analysis
                         if 'suggested_renames' in analysis:
                             df = df.rename(columns=analysis['suggested_renames'])
                         processed_files.append(

     """
     Standardize DataFrame column names and data types:
     - Drops any middle name columns.
+    - Cleans all column names (e.g., "Employee Name" becomes "employee_name").
+    - Renames synonyms to common names (e.g., mapping TIN-related columns to 'tin'
+      and salary-related columns to 'salary').
     - Creates an 'employee_name' column if missing but first_name and last_name exist.
     - Combines duplicate key columns into one.
+    - Forces key columns (tin and employee_name) to be strings.
     """
     # Drop any column that appears to be a middle name
     middle_name_cols = [col for col in df.columns if 'middle_name' in col.lower()]
     if middle_name_cols:
         df = df.drop(columns=middle_name_cols)
+    # Clean all column names so that "Employee Name" becomes "employee_name", etc.
     df.columns = [clean_column_name(col) for col in df.columns]
+    # Build a rename map for TIN and salary synonyms.
+    # Note: This will capture PAYE's "tin_or_personal_id_of_employee" too.
     rename_map = {}
     for col in df.columns:
         if col in ['personal id', 'personal_id', 'tax id', 'taxid'] or "personal_id_of_employee" in col:
             rename_map[col] = 'tin'
+        elif 'tin' in col and 'tin' not in rename_map.get(col, ''):
             rename_map[col] = 'tin'
         if any(keyword in col for keyword in ['salary', 'wage', 'earning', 'commission', 'fee', 'payment', 'compensation']):
             rename_map[col] = 'salary'
     if rename_map:
         df = df.rename(columns=rename_map)
+    # Combine duplicate columns for salary and tin if needed
     if 'salary' in df.columns and list(df.columns).count('salary') > 1:
         salary_cols = [col for col in df.columns if col == 'salary']
         df['salary'] = df[salary_cols].bfill(axis=1).iloc[:, 0]
         df['tin'] = df[tin_cols].bfill(axis=1).iloc[:, 0]
         df = df.loc[:, ~df.columns.duplicated()]
+    # If employee_name is missing but first_name and last_name exist, create it.
     if 'employee_name' not in df.columns and 'first_name' in df.columns and 'last_name' in df.columns:
         df['employee_name'] = df['first_name'].astype(str).str.strip() + ' ' + df['last_name'].astype(str).str.strip()
+    # Ensure key columns are of the proper type.
     if 'salary' in df.columns:
         df['salary'] = pd.to_numeric(df['salary'], errors='coerce')
     if 'tin' in df.columns:
 def merge_with_master(processed_files):
     """
     Merge multiple DataFrames using a two-step process:
+    1. Use the earnings file as master and drop its inaccurate 'tin' column.
+    2. Merge template info onto earnings using 'employee_name' (the key provided by "Employee Name").
+       The trusted 'tin' comes from the template file.
+    3. Merge the combined earnings–template DataFrame with the PAYE file using 'tin'.
     """
     earnings_file = None
     paye_file = None
     template_file = None
+    # Identify files based on filename keywords.
     for file_info in processed_files:
         lower_filename = file_info["filename"].lower()
         if "earnings" in lower_filename:
             paye_file = file_info
         elif "template" in lower_filename:
             template_file = file_info
     if not earnings_file:
         st.warning("No earnings file found as master. Using the first file as master.")
         earnings_file = processed_files[0]
+    # Use the earnings DataFrame as the master.
     earnings_df = earnings_file["df"]
+    # Drop the inaccurate 'tin' column from earnings if it exists.
     if 'tin' in earnings_df.columns:
         earnings_df = earnings_df.drop(columns=['tin'])
+    # Double-check removal of any middle_name column (should be done in standardization).
     if 'middle_name' in earnings_df.columns:
         earnings_df = earnings_df.drop(columns=['middle_name'])
     if template_file is not None:
         st.write(f"Merging template info from '{template_file['filename']}' using key 'employee_name'.")
         template_df = template_file["df"]
+        # Drop any middle_name column from the template file.
         if 'middle_name' in template_df.columns:
             template_df = template_df.drop(columns=['middle_name'])
+        # Ensure template has an 'employee_name' column (construct if necessary).
         if 'employee_name' not in template_df.columns and 'first_name' in template_df.columns and 'last_name' in template_df.columns:
             template_df['employee_name'] = template_df['first_name'].astype(str).str.strip() + ' ' + template_df['last_name'].astype(str).str.strip()
         if 'employee_name' in merged_df.columns and 'employee_name' in template_df.columns:
         else:
             st.warning("Column 'employee_name' missing in either the earnings or template file. Skipping template merge.")
     else:
+        st.warning("No template file detected. Cannot proceed without a trusted TIN from the template.")
+    # After merging, check that a trusted 'tin' is present from the template.
+    if 'tin' not in merged_df.columns:
+        st.error("No trusted 'tin' column found in the merged earnings-template data. Aborting further merge.")
+        return merged_df
     # Merge PAYE figures onto the merged DataFrame using 'tin'
     if paye_file is not None:
         st.write(f"Merging PAYE figures from '{paye_file['filename']}' using key 'tin'.")
         paye_df = paye_file["df"]
+        if 'tin' in paye_df.columns:
             merged_df = merged_df.merge(paye_df, on='tin', how='left', suffixes=('', '_paye'))
         else:
+            st.warning("Column 'tin' missing in the PAYE file. Skipping PAYE merge.")
     else:
         st.warning("No PAYE file detected.")
                     if df.empty:
                         st.warning(f"DataFrame from '{uploaded_file.name}' is empty. Please check the file.")
                         continue
+                    # Standardize columns and key identifiers.
                     df = standardize_dataframe(df)
                     st.write("Initial Preview:")
                     st.dataframe(df.head())
                     if analysis:
                         st.write("Column Analysis:")
                         st.json(analysis)
+                        # Apply any suggested renames from the analysis.
                         if 'suggested_renames' in analysis:
                             df = df.rename(columns=analysis['suggested_renames'])
                         processed_files.append(