Spaces:

rairo
/

OneExcelZimra

Sleeping

App Files Files Community

rairo committed on Feb 21, 2025

Commit

a35b8e4

verified ·

1 Parent(s): 42db88a

Update app.py

Browse files

Files changed (1) hide show

app.py +21 -29

app.py CHANGED Viewed

@@ -1,7 +1,5 @@
 import streamlit as st
 import pandas as pd
-import os
-from io import BytesIO
 import re
 def clean_column_name(col_name):
@@ -11,24 +9,20 @@ def clean_column_name(col_name):
     cleaned = re.sub(r"[^\w\s]", " ", col_name)
     return re.sub(r"\s+", "_", cleaned.strip().lower())
-def standardize_dataframe(df: pd.DataFrame) -> pd.DataFrame:
     """
-    Standardize DataFrame column names.
-    If a column name contains "tin" or variants of personal id,
-    rename it to 'tin'. Also, strip trailing spaces from all string values.
     """
     rename_map = {}
     for col in df.columns:
         col_lower = col.lower()
-        # Check for various forms of TIN column name.
-        if col_lower in ['personal id', 'personal_id', 'tax id', 'taxid'] or "personal_id_of_employee" in col_lower:
-            rename_map[col] = 'tin'
-        elif 'tin' in col_lower:
-            rename_map[col] = 'tin'
     if rename_map:
         df = df.rename(columns=rename_map)
-    # Strip trailing spaces from string values in every column.
     for col in df.columns:
         if df[col].dtype == object:
             df[col] = df[col].astype(str).str.strip()
@@ -46,14 +40,15 @@ def read_file(file) -> pd.DataFrame:
         return None
 def safe_display_df(df: pd.DataFrame) -> pd.DataFrame:
-    """Convert DataFrame to string values to ensure safe display."""
     return df.astype(str).replace({"nan": "", "None": ""})
 def main():
     st.title("Merge Employee Name from Earnings into PAYE Sheet")
     st.write(
-        "Upload an Earnings Sheet and a PAYE Sheet. The app will extract the first two columns "
-        "(TIN and Employee Name) from the Earnings Sheet and merge the Employee Name into the PAYE sheet based on matching TIN values."
     )
     earnings_file = st.file_uploader("Upload Earnings Sheet", type=["csv", "xlsx", "xls"], key="earnings")
@@ -68,37 +63,34 @@ def main():
             st.error("One of the files could not be read. Please check the files and try again.")
             return
-        # Clean and standardize column names and values
-        earnings_df.columns = [clean_column_name(col) for col in earnings_df.columns]
-        earnings_df = standardize_dataframe(earnings_df)
-        paye_df.columns = [clean_column_name(col) for col in paye_df.columns]
-        paye_df = standardize_dataframe(paye_df)
-        # Ensure earnings file has at least two columns
         if earnings_df.shape[1] < 2:
             st.error("Earnings sheet must have at least two columns (TIN and Employee Name).")
             return
-        # Extract first two columns from the earnings sheet.
-        # Assume the first column is TIN and the second is Employee Name.
         earnings_subset = earnings_df.iloc[:, :2].copy()
         earnings_subset.columns = ["tin", "employee_name"]
-        # Ensure trailing spaces are removed
         earnings_subset["tin"] = earnings_subset["tin"].astype(str).str.strip()
         earnings_subset["employee_name"] = earnings_subset["employee_name"].astype(str).str.strip()
         st.write("Preview of extracted TIN and Employee Name from Earnings Sheet:")
         st.dataframe(safe_display_df(earnings_subset.head()))
-        # Check for the 'tin' column in the PAYE sheet.
         if "tin" not in paye_df.columns:
-            st.error("The PAYE sheet does not have a recognized TIN column (e.g., 'tin' or 'personal_id_of_employee').")
             return
         else:
-            # Ensure trailing spaces are removed from PAYE tin values.
             paye_df["tin"] = paye_df["tin"].astype(str).str.strip()
-        # Merge the PAYE sheet with the earnings subset on the 'tin' column.
         merged_df = paye_df.merge(earnings_subset, on="tin", how="left")
         st.write("### Merged PAYE Sheet with Employee Name")
         st.dataframe(safe_display_df(merged_df.head()))

 import streamlit as st
 import pandas as pd
 import re
 def clean_column_name(col_name):
     cleaned = re.sub(r"[^\w\s]", " ", col_name)
     return re.sub(r"\s+", "_", cleaned.strip().lower())
+def standardize_tin_column(df: pd.DataFrame) -> pd.DataFrame:
     """
+    Clean column names and rename any column that contains 'tin'
+    or both 'personal' and 'id' to 'tin'. Then strip extra spaces.
     """
+    df.columns = [clean_column_name(col) for col in df.columns]
     rename_map = {}
     for col in df.columns:
         col_lower = col.lower()
+        if "tin" in col_lower or (("personal" in col_lower) and ("id" in col_lower)):
+            rename_map[col] = "tin"
     if rename_map:
         df = df.rename(columns=rename_map)
+    # Strip trailing spaces from string columns
     for col in df.columns:
         if df[col].dtype == object:
             df[col] = df[col].astype(str).str.strip()
         return None
 def safe_display_df(df: pd.DataFrame) -> pd.DataFrame:
+    """Convert DataFrame values to strings for safe display."""
     return df.astype(str).replace({"nan": "", "None": ""})
 def main():
     st.title("Merge Employee Name from Earnings into PAYE Sheet")
     st.write(
+        "Upload an Earnings Sheet and a PAYE Sheet. "
+        "The app will extract the first two columns (TIN and Employee Name) from the Earnings Sheet, "
+        "clean and standardize the TIN values, and then merge the Employee Name onto the PAYE sheet using the TIN."
     )
     earnings_file = st.file_uploader("Upload Earnings Sheet", type=["csv", "xlsx", "xls"], key="earnings")
             st.error("One of the files could not be read. Please check the files and try again.")
             return
+        # Standardize columns for both files
+        earnings_df = standardize_tin_column(earnings_df)
+        paye_df = standardize_tin_column(paye_df)
+        # Check that the earnings file has at least two columns
         if earnings_df.shape[1] < 2:
             st.error("Earnings sheet must have at least two columns (TIN and Employee Name).")
             return
+        # Extract first two columns from earnings file.
+        # Assume first column is TIN and second is Employee Name.
         earnings_subset = earnings_df.iloc[:, :2].copy()
         earnings_subset.columns = ["tin", "employee_name"]
+        # Ensure values are stripped of trailing spaces
         earnings_subset["tin"] = earnings_subset["tin"].astype(str).str.strip()
         earnings_subset["employee_name"] = earnings_subset["employee_name"].astype(str).str.strip()
         st.write("Preview of extracted TIN and Employee Name from Earnings Sheet:")
         st.dataframe(safe_display_df(earnings_subset.head()))
+        # Verify the PAYE sheet has a 'tin' column
         if "tin" not in paye_df.columns:
+            st.error("The PAYE sheet does not have a recognized TIN column (e.g., 'tin' or 'personal id').")
             return
         else:
             paye_df["tin"] = paye_df["tin"].astype(str).str.strip()
+        # Merge the employee name from earnings_subset onto the PAYE sheet using 'tin'
         merged_df = paye_df.merge(earnings_subset, on="tin", how="left")
         st.write("### Merged PAYE Sheet with Employee Name")
         st.dataframe(safe_display_df(merged_df.head()))