Spaces:

rairo
/

OneExcelZimra

Sleeping

App Files Files Community

rairo committed on Feb 21, 2025

Commit

42db88a

verified ·

1 Parent(s): 96d2597

Update app.py

Browse files

Files changed (1) hide show

app.py +45 -9

app.py CHANGED Viewed

@@ -11,6 +11,29 @@ def clean_column_name(col_name):
     cleaned = re.sub(r"[^\w\s]", " ", col_name)
     return re.sub(r"\s+", "_", cleaned.strip().lower())
 def read_file(file) -> pd.DataFrame:
     """Read a CSV or Excel file into a DataFrame."""
     try:
@@ -28,12 +51,16 @@ def safe_display_df(df: pd.DataFrame) -> pd.DataFrame:
 def main():
     st.title("Merge Employee Name from Earnings into PAYE Sheet")
-    st.write("Upload an Earnings Sheet and a PAYE Sheet. The app will extract the first two columns (TIN and Employee Name) from the Earnings Sheet and merge the Employee Name into the PAYE sheet based on matching TIN.")
     earnings_file = st.file_uploader("Upload Earnings Sheet", type=["csv", "xlsx", "xls"], key="earnings")
     paye_file = st.file_uploader("Upload PAYE Sheet", type=["csv", "xlsx", "xls"], key="paye")
     if earnings_file and paye_file:
         earnings_df = read_file(earnings_file)
         paye_df = read_file(paye_file)
@@ -41,32 +68,42 @@ def main():
             st.error("One of the files could not be read. Please check the files and try again.")
             return
-        # Clean column names for both dataframes
         earnings_df.columns = [clean_column_name(col) for col in earnings_df.columns]
         paye_df.columns = [clean_column_name(col) for col in paye_df.columns]
-        # Check that earnings file has at least two columns
         if earnings_df.shape[1] < 2:
             st.error("Earnings sheet must have at least two columns (TIN and Employee Name).")
             return
-        # Extract the first two columns from the earnings sheet.
-        # We assume the first column is TIN and the second is Employee Name.
         earnings_subset = earnings_df.iloc[:, :2].copy()
         earnings_subset.columns = ["tin", "employee_name"]
         st.write("Preview of extracted TIN and Employee Name from Earnings Sheet:")
         st.dataframe(safe_display_df(earnings_subset.head()))
-        # Merge the PAYE sheet with the extracted employee names based on the 'tin' column.
         if "tin" not in paye_df.columns:
-            st.error("The PAYE sheet does not have a 'tin' column to merge on.")
             return
         merged_df = paye_df.merge(earnings_subset, on="tin", how="left")
         st.write("### Merged PAYE Sheet with Employee Name")
         st.dataframe(safe_display_df(merged_df.head()))
-        # Provide option to download the merged data as CSV.
         csv_data = merged_df.to_csv(index=False).encode("utf-8")
         st.download_button(
             label="Download Merged CSV",
@@ -74,7 +111,6 @@ def main():
             file_name="merged_paye.csv",
             mime="text/csv"
         )
         st.write(f"Total rows in merged data: {len(merged_df)}")
     else:
         st.info("Please upload both an Earnings Sheet and a PAYE Sheet.")

     cleaned = re.sub(r"[^\w\s]", " ", col_name)
     return re.sub(r"\s+", "_", cleaned.strip().lower())
def _is_tin_column(name) -> bool:
    """Return True when a column label looks like a TIN / personal-id column."""
    # Tokenise on anything that is not a letter or digit so that "Employee TIN",
    # "employee_tin" and "tin-number" all expose a standalone "tin" token, while
    # unrelated names that merely contain the letters ("rating", "meeting",
    # "starting_date") do not match.
    lowered = str(name).lower()
    tokens = "".join(ch if ch.isalnum() else " " for ch in lowered).split()
    normalized = "_".join(tokens)
    if normalized in {"personal_id", "tax_id", "taxid"}:
        return True
    if "personal_id_of_employee" in normalized:
        return True
    return "tin" in tokens


def standardize_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """
    Return a standardized copy of *df*.

    Column names:
        If the frame has no column named exactly 'tin', the first column whose
        label looks like a TIN identifier (a standalone "tin" token, or a
        personal-id / tax-id variant) is renamed to 'tin'. At most one column
        is renamed, so the result never gains duplicate 'tin' columns.

    Values:
        Every value in a text (object or pandas string dtype) column is
        converted to ``str`` and stripped of leading and trailing whitespace.
        Missing values (NaN / None / NA) are left missing instead of being
        turned into the literal strings "nan" / "None".

    The input DataFrame is never modified; a new DataFrame is returned.
    """
    labels = list(df.columns)
    if "tin" not in labels:
        for pos, label in enumerate(labels):
            if _is_tin_column(label):
                labels[pos] = "tin"
                break  # rename only one column to avoid duplicate labels

    # Work on a copy so the caller's frame is untouched on every code path.
    result = df.copy()
    result.columns = labels

    # Iterate by position: labels may legitimately repeat in uploaded sheets,
    # and label-based access would then return a DataFrame instead of a Series.
    for pos in range(result.shape[1]):
        column = result.iloc[:, pos]
        is_text = column.dtype == object or isinstance(column.dtype, pd.StringDtype)
        if not is_text:
            continue
        missing = column.isna()
        stripped = column.astype(str).str.strip()
        # Keep the original (missing) value where it was missing.
        result.iloc[:, pos] = column.where(missing, stripped)
    return result
 def read_file(file) -> pd.DataFrame:
     """Read a CSV or Excel file into a DataFrame."""
     try:
 def main():
     st.title("Merge Employee Name from Earnings into PAYE Sheet")
+    st.write(
+        "Upload an Earnings Sheet and a PAYE Sheet. The app will extract the first two columns "
+        "(TIN and Employee Name) from the Earnings Sheet and merge the Employee Name into the PAYE sheet based on matching TIN values."
+    )
     earnings_file = st.file_uploader("Upload Earnings Sheet", type=["csv", "xlsx", "xls"], key="earnings")
     paye_file = st.file_uploader("Upload PAYE Sheet", type=["csv", "xlsx", "xls"], key="paye")
     if earnings_file and paye_file:
+        # Read the files
         earnings_df = read_file(earnings_file)
         paye_df = read_file(paye_file)
             st.error("One of the files could not be read. Please check the files and try again.")
             return
+        # Clean and standardize column names and values
         earnings_df.columns = [clean_column_name(col) for col in earnings_df.columns]
+        earnings_df = standardize_dataframe(earnings_df)
         paye_df.columns = [clean_column_name(col) for col in paye_df.columns]
+        paye_df = standardize_dataframe(paye_df)
+        # Ensure earnings file has at least two columns
         if earnings_df.shape[1] < 2:
             st.error("Earnings sheet must have at least two columns (TIN and Employee Name).")
             return
+        # Extract first two columns from the earnings sheet.
+        # Assume the first column is TIN and the second is Employee Name.
         earnings_subset = earnings_df.iloc[:, :2].copy()
         earnings_subset.columns = ["tin", "employee_name"]
+        # Ensure trailing spaces are removed
+        earnings_subset["tin"] = earnings_subset["tin"].astype(str).str.strip()
+        earnings_subset["employee_name"] = earnings_subset["employee_name"].astype(str).str.strip()
         st.write("Preview of extracted TIN and Employee Name from Earnings Sheet:")
         st.dataframe(safe_display_df(earnings_subset.head()))
+        # Check for the 'tin' column in the PAYE sheet.
         if "tin" not in paye_df.columns:
+            st.error("The PAYE sheet does not have a recognized TIN column (e.g., 'tin' or 'personal_id_of_employee').")
             return
+        else:
+            # Ensure trailing spaces are removed from PAYE tin values.
+            paye_df["tin"] = paye_df["tin"].astype(str).str.strip()
+        # Merge the PAYE sheet with the earnings subset on the 'tin' column.
         merged_df = paye_df.merge(earnings_subset, on="tin", how="left")
         st.write("### Merged PAYE Sheet with Employee Name")
         st.dataframe(safe_display_df(merged_df.head()))
+        # Option to download the merged data as CSV.
         csv_data = merged_df.to_csv(index=False).encode("utf-8")
         st.download_button(
             label="Download Merged CSV",
             file_name="merged_paye.csv",
             mime="text/csv"
         )
         st.write(f"Total rows in merged data: {len(merged_df)}")
     else:
         st.info("Please upload both an Earnings Sheet and a PAYE Sheet.")