Spaces:

rairo
/

OneExcelZimraAI

Build error

App Files Community

rairo committed on Feb 20, 2025

Commit

d05ef69

verified ·

1 Parent(s): d6d231e

Update app.py

Browse files

Files changed (1) hide show

app.py +12 -17

app.py CHANGED Viewed

@@ -23,8 +23,7 @@ def clean_column_name(col_name):
 def clean_tin_value(val):
     """
-    Clean the TIN value by stripping whitespace and,
-    if it ends with '.0', converting it to an integer string.
     """
     val_str = str(val).strip()
     if val_str.endswith('.0'):
@@ -38,24 +37,22 @@ def clean_tin_value(val):
 def standardize_dataframe(df: pd.DataFrame) -> pd.DataFrame:
     """
     Standardize DataFrame column names and data types.
-    - Renames synonyms to common names (e.g., tin, salary).
-    - Creates an employee_name column if missing but first_name and last_name exist.
     - Combines duplicate key columns (e.g., multiple 'salary' or 'tin' columns) into one.
-    - Cleans the key columns 'tin' and 'employee_name' for consistency.
     """
     rename_map = {}
-    # Standardize TIN-related columns
     for col in df.columns:
         col_lower = col.lower()
-        if col_lower in ['personal id', 'personal_id', 'tax id', 'taxid']:
             rename_map[col] = 'tin'
         elif 'tin' in col_lower:
             rename_map[col] = 'tin'
-    # Standardize salary columns
-    for col in df.columns:
-        col_lower = col.lower()
         if any(keyword in col_lower for keyword in ['salary', 'wage', 'earning', 'commission', 'fee', 'payment', 'compensation']):
             rename_map[col] = 'salary'
@@ -78,16 +75,15 @@ def standardize_dataframe(df: pd.DataFrame) -> pd.DataFrame:
     if 'employee_name' not in df.columns and 'first_name' in df.columns and 'last_name' in df.columns:
         df['employee_name'] = df['first_name'].astype(str).str.strip() + ' ' + df['last_name'].astype(str).str.strip()
-    # Ensure salary column is numeric (to avoid conversion errors)
     if 'salary' in df.columns:
         df['salary'] = pd.to_numeric(df['salary'], errors='coerce')
-    # Clean key columns:
     if 'tin' in df.columns:
-        # First, cast to string then clean individual values
-        df['tin'] = df['tin'].astype(str).apply(clean_tin_value)
     if 'employee_name' in df.columns:
-        df['employee_name'] = df['employee_name'].astype(str).str.strip()
     return df
@@ -196,7 +192,6 @@ def merge_with_master(processed_files):
     master_df = master_file["df"]
     st.write(f"Using '{master_file['filename']}' as master for merging.")
-    # Use both 'tin' and 'employee_name' if available, else fallback to common columns.
     default_keys = ['tin', 'employee_name']
     merged_df = master_df

 def clean_tin_value(val):
     """
+    Clean the TIN value by stripping whitespace and, if it ends with '.0', converting it to an integer string.
     """
     val_str = str(val).strip()
     if val_str.endswith('.0'):
 def standardize_dataframe(df: pd.DataFrame) -> pd.DataFrame:
     """
     Standardize DataFrame column names and data types.
+    - Renames synonyms to common names (e.g., 'tin', 'salary').
+    - In particular, any header containing 'personal_id_of_employee' (or similar) or 'tin' is renamed to 'tin'.
+    - Creates an 'employee_name' column if missing but first_name and last_name exist.
     - Combines duplicate key columns (e.g., multiple 'salary' or 'tin' columns) into one.
+    - Forces the key columns 'tin' and 'employee_name' to be strings.
     """
     rename_map = {}
     for col in df.columns:
         col_lower = col.lower()
+        # Rename headers to 'tin'
+        if col_lower in ['personal id', 'personal_id', 'tax id', 'taxid'] or "personal_id_of_employee" in col_lower:
             rename_map[col] = 'tin'
         elif 'tin' in col_lower:
             rename_map[col] = 'tin'
+        # Rename headers to 'salary'
         if any(keyword in col_lower for keyword in ['salary', 'wage', 'earning', 'commission', 'fee', 'payment', 'compensation']):
             rename_map[col] = 'salary'
     if 'employee_name' not in df.columns and 'first_name' in df.columns and 'last_name' in df.columns:
         df['employee_name'] = df['first_name'].astype(str).str.strip() + ' ' + df['last_name'].astype(str).str.strip()
+    # Ensure salary column is numeric (to avoid conversion errors later)
     if 'salary' in df.columns:
         df['salary'] = pd.to_numeric(df['salary'], errors='coerce')
+    # Force key columns to be strings, filling NaNs with empty strings
     if 'tin' in df.columns:
+        df['tin'] = df['tin'].fillna('').astype(str).apply(clean_tin_value)
     if 'employee_name' in df.columns:
+        df['employee_name'] = df['employee_name'].fillna('').astype(str).str.strip()
     return df
     master_df = master_file["df"]
     st.write(f"Using '{master_file['filename']}' as master for merging.")
     default_keys = ['tin', 'employee_name']
     merged_df = master_df