Spaces:

rairo
/

OneExcelZimra

Sleeping

App Files Files Community

rairo committed on Feb 11, 2025

Commit

e19ef0e

verified ·

1 Parent(s): 412766b

Update app.py

Browse files

Files changed (1) hide show

app.py +103 -113

app.py CHANGED Viewed

@@ -2,153 +2,140 @@ import streamlit as st
 import pandas as pd
 from io import BytesIO
 import numpy as np
def normalize_column_name(col: str) -> str:
    """Return a column label with all whitespace runs collapsed to single spaces.

    Args:
        col: Column label as read from the spreadsheet. Non-string labels
            (e.g. integers) are accepted and returned as ``str(col)``.

    Returns:
        The label with leading/trailing whitespace removed and every internal
        run of whitespace (including newlines) replaced by one space.
    """
    if not isinstance(col, str):
        return str(col)
    # str.split() with no argument already splits on any whitespace (newlines
    # included) and discards leading/trailing runs.
    return " ".join(col.split())
def split_currency_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Rename currency columns so the currency tag is a uniform trailing suffix.

    A column whose name contains ``USD`` (any case) is renamed to
    ``"<base> USD"``; one containing ``ZWL`` or ``ZWG`` is renamed to
    ``"<base> ZWL"``. ``<base>`` is the original name with the currency tag
    removed and whitespace collapsed. Non-string labels and columns without a
    currency tag are left untouched.

    Args:
        df: Frame whose columns are inspected. It is renamed in place.

    Returns:
        The same ``df`` object, for call chaining.
    """
    import re  # local import: this block cannot rely on a module-level `re`

    renames = {}
    for col in df.columns:
        if not isinstance(col, str):
            continue
        upper = col.upper()
        if 'USD' in upper:
            suffix = 'USD'
        elif 'ZWL' in upper or 'ZWG' in upper:
            suffix = 'ZWL'
        else:
            continue
        # Strip the tag case-insensitively so it matches the case-insensitive
        # detection above; otherwise "Salary usd" would become "Salary usd USD".
        stripped = re.sub(r'USD|ZWL|ZWG', '', col, flags=re.IGNORECASE)
        base_name = " ".join(stripped.split())
        renames[col] = f"{base_name} {suffix}".strip()
    df.rename(columns=renames, inplace=True)
    return df
def clean_tin(tin: str) -> str:
    """Return a TIN as a trimmed string with whitespace runs collapsed.

    Args:
        tin: Raw cell value. Missing values (``None``/``NaN``) are accepted.

    Returns:
        ``""`` for missing values; otherwise the string form of ``tin`` with
        surrounding whitespace removed and internal whitespace runs collapsed
        to one space.
    """
    if pd.isna(tin):
        return ""
    # An all-digit ID column containing blanks is read as float, so the value
    # arrives as 12345678.0; render it without the spurious ".0" so it still
    # matches the same ID read as int or text from another file.
    if isinstance(tin, float) and tin.is_integer():
        tin = int(tin)
    return " ".join(str(tin).split())
def clean_name(name: str) -> str:
    """Return a name upper-cased with whitespace runs collapsed.

    Args:
        name: Raw cell value. Missing values (``None``/``NaN``) are accepted.

    Returns:
        ``""`` for missing values; otherwise ``str(name)`` in upper case with
        surrounding whitespace removed and internal runs collapsed to one space.
    """
    if pd.isna(name):
        return ""
    # split() discards leading/trailing whitespace, so no separate strip().
    return " ".join(str(name).upper().split())
def process_dataframe(df: pd.DataFrame, file_name: str) -> pd.DataFrame:
    """Normalize one uploaded sheet into the common payroll layout.

    Steps: drop fully empty rows, normalize column labels, map the employee-ID
    column to ``TIN``, derive ``Employee Name`` from first/last name when it is
    absent, clean both key columns, and unify currency column suffixes.

    Args:
        df: Frame read from the uploaded workbook. It is not modified.
        file_name: Name shown in the Streamlit progress message.

    Returns:
        A new, processed frame.
    """
    st.write(f"Processing file: **{file_name}**")

    # Drop blank rows first: the key cleaning below turns NaN into "", after
    # which no row is all-NaN and dropna(how='all') would remove nothing.
    # copy() keeps the caller's frame untouched.
    df = df.dropna(how='all').copy()

    df.columns = [normalize_column_name(col) for col in df.columns]

    # Only map an alias onto "TIN" when no TIN column exists yet; renaming
    # otherwise creates two "TIN" columns and df["TIN"] becomes a DataFrame.
    if "TIN" not in df.columns:
        for alias in ("TIN or Personal ID of Employee", "Personal ID of Employee"):
            if alias in df.columns:
                df = df.rename(columns={alias: "TIN"})
                break

    if "Employee Name" not in df.columns and "First Name" in df.columns and "Last Name" in df.columns:
        # astype(str) so numeric or other non-text cells do not break the "+".
        first = df["First Name"].fillna("").astype(str)
        last = df["Last Name"].fillna("").astype(str)
        df["Employee Name"] = first + " " + last

    if "TIN" in df.columns:
        df["TIN"] = df["TIN"].apply(clean_tin)
    if "Employee Name" in df.columns:
        df["Employee Name"] = df["Employee Name"].apply(clean_name)

    return split_currency_columns(df)
def merge_dataframes(dfs: list) -> pd.DataFrame:
    """Outer-merge all frames on ``TIN`` and ``Employee Name``.

    Args:
        dfs: Processed payroll frames, merged left to right. The input frames
            are not modified.

    Returns:
        An empty frame when ``dfs`` is empty; otherwise the merged frame with
        missing values replaced by ``""``. When several frames share a non-key
        column, the earliest frame's value wins and later frames only fill
        cells that are still missing.
    """
    if not dfs:
        return pd.DataFrame()

    key_cols = ["TIN", "Employee Name"]

    def _with_clean_keys(frame: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of ``frame`` that has both key columns, cleaned."""
        frame = frame.copy()
        for col in key_cols:
            if col not in frame.columns:
                frame[col] = ""
        frame["TIN"] = frame["TIN"].apply(clean_tin)
        frame["Employee Name"] = frame["Employee Name"].apply(clean_name)
        return frame

    master_df = dfs[0]
    if len(dfs) > 1:
        master_df = _with_clean_keys(master_df)

    for df in dfs[1:]:
        master_df = pd.merge(
            master_df, _with_clean_keys(df),
            on=key_cols,
            how="outer",
            suffixes=("", "_drop"),
        )
        drop_cols = [
            col for col in master_df.columns
            if isinstance(col, str) and col.endswith('_drop')
        ]
        # Before discarding the right-hand duplicates, use them to fill gaps:
        # a row that exists only in the later file has NaN in the master
        # column and its real value in the "_drop" twin.
        for col in drop_cols:
            base = col[:-len('_drop')]
            if base in master_df.columns:
                master_df[base] = master_df[base].combine_first(master_df[col])
        master_df = master_df.drop(columns=drop_cols)

    # Final cleanup: blank out anything still missing.
    master_df = master_df.replace({np.nan: "", None: ""})
    return master_df
 def main():
-    st.title("Enhanced Payroll Data Processor")
     st.write("""
-    Upload your payroll data files. The system will:
-    1. Standardize employee identification using TIN and Employee Name
-    2. Handle both USD and ZWL currency columns
-    3. Merge all data into a comprehensive master sheet
     """)
-    uploaded_files = st.file_uploader(
-        "Upload payroll data files",
-        type=["xlsx", "xls"],
-        accept_multiple_files=True
-    )
-    if uploaded_files:
-        processed_dfs = []
-        for file in uploaded_files:
-            try:
-                df = pd.read_excel(file)
-                df = process_dataframe(df, file.name)
-                processed_dfs.append(df)
-                st.write(f"Successfully processed {file.name}")
-            except Exception as e:
-                st.error(f"Error processing {file.name}: {str(e)}")
-                return
-        if processed_dfs:
-            master_df = merge_dataframes(processed_dfs)
             st.subheader("Master Payroll Data Preview")
-            st.dataframe(master_df)
             # Prepare download
             output = BytesIO()
             with pd.ExcelWriter(output, engine='xlsxwriter') as writer:
-                master_df.to_excel(writer, index=False, sheet_name='Master Payroll')
             st.download_button(
                 label="Download Master Payroll Excel",
@@ -156,6 +143,9 @@ def main():
                 file_name="master_payroll.xlsx",
                 mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
             )
 if __name__ == "__main__":
     main()

 import pandas as pd
 from io import BytesIO
 import numpy as np
+import re
def standardize_tin(tin):
    """Return a TIN in canonical ``NN-NNNNNN L NN`` form when it fits that shape.

    Args:
        tin: Raw cell value. Missing values (``None``/``NaN``) are accepted.

    Returns:
        ``""`` for missing values. Otherwise all whitespace is removed; if the
        result is two digits, an optional hyphen, six digits, an upper-case
        letter and two digits, it is re-spaced as ``"12-345678 A 12"``. Any
        other value is returned with whitespace removed. The function is
        idempotent, so it may be applied to an already-standardized value.
    """
    if pd.isna(tin):
        return ""
    compact = re.sub(r'\s+', '', str(tin))
    # Capture the parts with groups rather than fixed slices: the hyphen is
    # optional, so slice offsets would be off by one for hyphenated input.
    match = re.fullmatch(r'(\d{2})-?(\d{6})([A-Z])(\d{2})', compact)
    if match:
        prefix, serial, letter, check = match.groups()
        return f"{prefix}-{serial} {letter} {check}"
    return compact
def clean_name(name):
    """Return a name upper-cased with whitespace runs collapsed.

    Args:
        name: Raw cell value. Missing values (``None``/``NaN``) are accepted.

    Returns:
        ``""`` for missing values; otherwise ``str(name)`` in upper case with
        surrounding whitespace removed and internal runs collapsed to one space.
    """
    if pd.isna(name):
        return ""
    # split() discards leading/trailing whitespace, so no separate strip().
    return " ".join(str(name).upper().split())
def process_employee_data(df):
    """Prepare the employee-information sheet for merging.

    Strips whitespace from column labels, derives ``Employee Name`` from
    ``First Name`` and ``Last Name`` when both exist, and writes a standardized
    ``TIN`` column from the first ID column found.

    Args:
        df: Frame read from the employee workbook. It is modified in place.

    Returns:
        The same ``df`` object. No ``TIN`` column is created when none of the
        recognized ID columns is present.
    """
    # str() first: header cells can be numbers or dates, which have no .strip().
    df.columns = [str(col).strip() for col in df.columns]

    if 'First Name' in df.columns and 'Last Name' in df.columns:
        # Outer strip() so a missing first or last name does not leave a
        # leading/trailing space in the combined name.
        df['Employee Name'] = [
            f"{clean_name(first)} {clean_name(last)}".strip()
            for first, last in zip(df['First Name'], df['Last Name'])
        ]

    # Accept the same ID headers as the salary sheet; 'TIN' takes precedence.
    for tin_col in ('TIN', 'TIN or Personal ID of Employee', 'Personal ID of Employee'):
        if tin_col in df.columns:
            df['TIN'] = df[tin_col].apply(standardize_tin)
            break
    return df
def process_salary_data(df):
    """Prepare the salary/deductions sheet for merging.

    Strips whitespace from column labels, writes a standardized ``TIN`` column
    from the first ID column found, and fills missing values in numeric
    columns with 0.

    Args:
        df: Frame read from the salary workbook. It is modified in place.

    Returns:
        The same ``df`` object. No ``TIN`` column is created when none of the
        recognized ID columns is present.
    """
    # str() first: header cells can be numbers or dates, which have no .strip().
    df.columns = [str(col).strip() for col in df.columns]

    # Accept the same ID headers as the employee sheet; 'TIN' takes precedence.
    for tin_col in ('TIN', 'TIN or Personal ID of Employee', 'Personal ID of Employee'):
        if tin_col in df.columns:
            df['TIN'] = df[tin_col].apply(standardize_tin)
            break

    # Columns selected here are already numeric, so only the NaN fill is needed.
    numeric_columns = df.select_dtypes(include=[np.number]).columns
    if len(numeric_columns):
        df[numeric_columns] = df[numeric_columns].fillna(0)
    return df
def merge_dataframes(employee_df, salary_df):
    """Outer-merge employee and salary information on ``TIN``.

    Args:
        employee_df: Processed employee frame; must contain ``TIN``.
        salary_df: Processed salary frame; must contain ``TIN``.
            Neither input frame is modified.

    Returns:
        The merged frame. For a column present in both inputs the employee
        value is kept and the salary value only fills missing cells. Missing
        values in numeric columns are filled with 0.

    Raises:
        KeyError: If either frame has no ``TIN`` column.
    """
    for label, frame in (("employee", employee_df), ("salary", salary_df)):
        if 'TIN' not in frame.columns:
            raise KeyError(f"'TIN' column not found in {label} data")

    # assign() returns new frames, so the caller's inputs stay untouched.
    employee_df = employee_df.assign(TIN=employee_df['TIN'].apply(standardize_tin))
    salary_df = salary_df.assign(TIN=salary_df['TIN'].apply(standardize_tin))

    merged_df = pd.merge(
        employee_df,
        salary_df,
        on='TIN',
        how='outer',
        suffixes=('', '_y'),
    )

    duplicate_cols = [
        col for col in merged_df.columns
        if isinstance(col, str) and col.endswith('_y')
    ]
    # Before discarding the salary-side duplicates, use them to fill gaps: a
    # TIN found only in the salary sheet has NaN in the employee-side column.
    for col in duplicate_cols:
        base = col[:-len('_y')]
        if base in merged_df.columns:
            merged_df[base] = merged_df[base].combine_first(merged_df[col])
    merged_df = merged_df.drop(columns=duplicate_cols)

    numeric_columns = merged_df.select_dtypes(include=[np.number]).columns
    if len(numeric_columns):
        merged_df[numeric_columns] = merged_df[numeric_columns].fillna(0)
    return merged_df
 def main():
+    st.title("Payroll Data Processor")
     st.write("""
+    Upload:
+    1. Employee Information File (with personal details)
+    2. Salary Information File (with financial data)
     """)
+    employee_file = st.file_uploader("Upload Employee Information", type=['xlsx', 'xls'])
+    salary_file = st.file_uploader("Upload Salary Information", type=['xlsx', 'xls'])
+    if employee_file and salary_file:
+        try:
+            # Process employee data
+            employee_df = pd.read_excel(employee_file)
+            employee_df = process_employee_data(employee_df)
+            st.write("Employee data processed successfully")
+            # Process salary data
+            salary_df = pd.read_excel(salary_file)
+            salary_df = process_salary_data(salary_df)
+            st.write("Salary data processed successfully")
+            # Merge the dataframes
+            final_df = merge_dataframes(employee_df, salary_df)
+            # Organize columns in desired order
+            column_order = [
+                'TIN', 'Employee Name', 'First Name', 'Middle Name', 'Last Name',
+                'Birth Date', 'Employed From date', 'Employed To date', 'Position'
+            ]
+            # Add remaining columns in their original order
+            remaining_cols = [col for col in final_df.columns if col not in column_order]
+            column_order.extend(remaining_cols)
+            # Reorder columns
+            final_df = final_df[column_order]
             st.subheader("Master Payroll Data Preview")
+            st.dataframe(final_df)
             # Prepare download
             output = BytesIO()
             with pd.ExcelWriter(output, engine='xlsxwriter') as writer:
+                final_df.to_excel(writer, index=False, sheet_name='Master Payroll')
             st.download_button(
                 label="Download Master Payroll Excel",
                 file_name="master_payroll.xlsx",
                 mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
             )
+        except Exception as e:
+            st.error(f"Error processing files: {str(e)}")
 if __name__ == "__main__":
     main()