Spaces:
Sleeping
Sleeping
File size: 5,189 Bytes
90c52bb e19ef0e 90c52bb 96d2597 412766b a35b8e4 42db88a a35b8e4 0c23633 42db88a a35b8e4 42db88a a35b8e4 42db88a 0c23633 42db88a 0c23633 96d2597 0c23633 96d2597 0c23633 96d2597 90c52bb 96d2597 a35b8e4 96d2597 c484caf 96d2597 42db88a 0c23633 680d9f2 42db88a a03e4cd 96d2597 a03e4cd 96d2597 0c23633 96d2597 9f460af 0c23633 a35b8e4 1fc5859 0c23633 96d2597 e19ef0e 0c23633 96d2597 42db88a 0c23633 96d2597 955b70b 680d9f2 96d2597 a35b8e4 96d2597 42db88a 90c52bb 0c23633 96d2597 b8e6e48 42db88a 96d2597 90c52bb |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 |
import streamlit as st
import pandas as pd
import re
def clean_column_name(col_name):
"""Clean column names by replacing non-word characters with underscores."""
if not isinstance(col_name, str):
return str(col_name)
cleaned = re.sub(r"[^\w\s]", " ", col_name)
return re.sub(r"\s+", "_", cleaned.strip().lower())
def standardize_tin_column(df: pd.DataFrame) -> pd.DataFrame:
"""
Clean column names and rename any column that contains 'tin'
or both 'personal' and 'id' to 'tin'. Then strip extra spaces from all string values.
"""
df.columns = [clean_column_name(col) for col in df.columns]
rename_map = {}
for col in df.columns:
col_lower = col.lower()
if "tin" in col_lower or (("personal" in col_lower) and ("id" in col_lower)):
rename_map[col] = "tin"
if rename_map:
df = df.rename(columns=rename_map)
# Remove trailing and leading spaces in string cells
for col in df.columns:
if df[col].dtype == object:
df[col] = df[col].astype(str).str.strip()
return df
def read_file(file, skip_first_row=False) -> pd.DataFrame:
"""
Read a CSV or Excel file into a DataFrame.
For the earnings file, skip_first_row=True will skip the first row (with currency labels).
"""
try:
if file.name.endswith((".xlsx", ".xls")):
return pd.read_excel(file, skiprows=1 if skip_first_row else None)
else:
return pd.read_csv(file, skiprows=1 if skip_first_row else None)
except Exception as e:
st.error(f"Error reading {file.name}: {str(e)}")
return None
def safe_display_df(df: pd.DataFrame) -> pd.DataFrame:
"""Convert DataFrame values to strings for safe display."""
return df.astype(str).replace({"nan": "", "None": ""})
def main():
st.title("Merge Employee Name from Earnings into PAYE Sheet")
st.write(
"Upload an Earnings Sheet and a PAYE Sheet. The Earnings Sheet is assumed to have a first row with currency labels "
"which will be skipped. The app will extract the first two columns (TIN and Employee Name) from the Earnings Sheet, "
"and merge the Employee Name onto the PAYE sheet using the cleaned TIN."
)
earnings_file = st.file_uploader("Upload Earnings Sheet", type=["csv", "xlsx", "xls"], key="earnings")
paye_file = st.file_uploader("Upload PAYE Sheet", type=["csv", "xlsx", "xls"], key="paye")
if earnings_file and paye_file:
# Read the earnings file with the first row skipped and the PAYE file normally.
earnings_df = read_file(earnings_file, skip_first_row=True)
paye_df = read_file(paye_file, skip_first_row=False)
if earnings_df is None or paye_df is None:
st.error("One of the files could not be read. Please check the files and try again.")
return
# Standardize columns and TIN values for both files.
earnings_df = standardize_tin_column(earnings_df)
paye_df = standardize_tin_column(paye_df)
# Debug: display unique TIN values from both files
st.write("Unique TIN values in Earnings file:", earnings_df.iloc[:, 0].unique())
if "tin" in paye_df.columns:
st.write("Unique TIN values in PAYE file:", paye_df["tin"].unique())
else:
st.write("PAYE file columns:", list(paye_df.columns))
# Check that the earnings file has at least two columns.
if earnings_df.shape[1] < 2:
st.error("Earnings sheet must have at least two columns (TIN and Employee Name).")
return
# Extract the first two columns from the earnings file.
earnings_subset = earnings_df.iloc[:, :2].copy()
earnings_subset.columns = ["tin", "employee_name"]
earnings_subset["tin"] = earnings_subset["tin"].astype(str).str.strip()
earnings_subset["employee_name"] = earnings_subset["employee_name"].astype(str).str.strip()
st.write("Preview of extracted TIN and Employee Name from Earnings Sheet:")
st.dataframe(safe_display_df(earnings_subset.head()))
# Verify the PAYE sheet has a 'tin' column.
if "tin" not in paye_df.columns:
st.error("The PAYE sheet does not have a recognized TIN column (e.g., 'tin' or 'personal id').")
return
else:
paye_df["tin"] = paye_df["tin"].astype(str).str.strip()
# Merge the employee name from earnings_subset onto the PAYE sheet using 'tin'.
merged_df = paye_df.merge(earnings_subset, on="tin", how="left")
st.write("### Merged PAYE Sheet with Employee Name")
st.dataframe(safe_display_df(merged_df.head()))
# Option to download the merged data as CSV.
csv_data = merged_df.to_csv(index=False).encode("utf-8")
st.download_button(
label="Download Merged CSV",
data=csv_data,
file_name="merged_paye.csv",
mime="text/csv"
)
st.write(f"Total rows in merged data: {len(merged_df)}")
else:
st.info("Please upload both an Earnings Sheet and a PAYE Sheet.")
if __name__ == "__main__":
main() |