Spaces:

rairo
/

OneExcelZimraAI

Build error

App Files Community

rairo committed on Feb 20, 2025

Commit

2076af3

verified ·

1 Parent(s): 5e8d055

Update app.py

Browse files

Files changed (1) hide show

app.py +16 -7

app.py CHANGED Viewed

@@ -27,6 +27,7 @@ def standardize_dataframe(df: pd.DataFrame) -> pd.DataFrame:
     - Renames synonyms to common names (e.g., tin, salary).
     - Creates an employee_name column if missing but first_name and last_name exist.
     - Converts the salary column to numeric.
     """
     rename_map = {}
@@ -38,7 +39,7 @@ def standardize_dataframe(df: pd.DataFrame) -> pd.DataFrame:
         elif 'tin' in col_lower:
             rename_map[col] = 'tin'
-    # Standardize salary columns (e.g., current_salary_wages_fees_commissions_etc_regular_earnings)
     for col in df.columns:
         col_lower = col.lower()
         if any(keyword in col_lower for keyword in ['salary', 'wage', 'earning', 'commission', 'fee', 'payment', 'compensation']):
@@ -47,11 +48,24 @@ def standardize_dataframe(df: pd.DataFrame) -> pd.DataFrame:
     if rename_map:
         df = df.rename(columns=rename_map)
     # Create employee_name if not present but first_name and last_name exist
     if 'employee_name' not in df.columns and 'first_name' in df.columns and 'last_name' in df.columns:
         df['employee_name'] = df['first_name'].astype(str).str.strip() + ' ' + df['last_name'].astype(str).str.strip()
-    # Ensure salary column is numeric (to avoid pyarrow conversion errors)
     if 'salary' in df.columns:
         df['salary'] = pd.to_numeric(df['salary'], errors='coerce')
@@ -148,7 +162,6 @@ def merge_with_master(processed_files):
     master_file = None
     other_files = []
-    # Identify the master file by checking for 'earnings' in the filename
     for file_info in processed_files:
         if "earnings" in file_info["filename"].lower():
             master_file = file_info
@@ -163,16 +176,13 @@ def merge_with_master(processed_files):
     master_df = master_file["df"]
     st.write(f"Using '{master_file['filename']}' as master for merging.")
-    # Define default key columns for merging
     default_keys = ['tin', 'employee_name']
     merged_df = master_df
     for other in other_files:
         other_df = other["df"]
-        # Try to use default keys if they exist in both
         keys_to_use = [key for key in default_keys if key in other_df.columns and key in merged_df.columns]
         if not keys_to_use:
-            # Fallback: use intersection of columns if default keys aren't found
             keys_to_use = list(set(merged_df.columns).intersection(set(other_df.columns)))
         if keys_to_use:
             st.write(f"Merging '{other['filename']}' on keys: {keys_to_use}")
@@ -210,7 +220,6 @@ def main():
                     df = pd.read_csv(uploaded_file)
                 if df is not None:
-                    # Clean and standardize column names and data types
                     df.columns = [clean_column_name(col) for col in df.columns]
                     df = standardize_dataframe(df)

     - Renames synonyms to common names (e.g., tin, salary).
     - Creates an employee_name column if missing but first_name and last_name exist.
     - Converts the salary column to numeric.
+    - Combines duplicate key columns (e.g., multiple 'salary' or 'tin' columns) into one.
     """
     rename_map = {}
         elif 'tin' in col_lower:
             rename_map[col] = 'tin'
+    # Standardize salary columns
     for col in df.columns:
         col_lower = col.lower()
         if any(keyword in col_lower for keyword in ['salary', 'wage', 'earning', 'commission', 'fee', 'payment', 'compensation']):
     if rename_map:
         df = df.rename(columns=rename_map)
+    # Combine duplicate columns for 'salary'
+    if 'salary' in df.columns and list(df.columns).count('salary') > 1:
+        salary_cols = [col for col in df.columns if col == 'salary']
+        # Use backfill across the duplicate columns and take the first non-null value
+        df['salary'] = df[salary_cols].bfill(axis=1).iloc[:, 0]
+        df = df.loc[:, ~df.columns.duplicated()]
+    # Combine duplicate columns for 'tin'
+    if 'tin' in df.columns and list(df.columns).count('tin') > 1:
+        tin_cols = [col for col in df.columns if col == 'tin']
+        df['tin'] = df[tin_cols].bfill(axis=1).iloc[:, 0]
+        df = df.loc[:, ~df.columns.duplicated()]
     # Create employee_name if not present but first_name and last_name exist
     if 'employee_name' not in df.columns and 'first_name' in df.columns and 'last_name' in df.columns:
         df['employee_name'] = df['first_name'].astype(str).str.strip() + ' ' + df['last_name'].astype(str).str.strip()
+    # Ensure salary column is numeric (to avoid conversion errors)
     if 'salary' in df.columns:
         df['salary'] = pd.to_numeric(df['salary'], errors='coerce')
     master_file = None
     other_files = []
     for file_info in processed_files:
         if "earnings" in file_info["filename"].lower():
             master_file = file_info
     master_df = master_file["df"]
     st.write(f"Using '{master_file['filename']}' as master for merging.")
     default_keys = ['tin', 'employee_name']
     merged_df = master_df
     for other in other_files:
         other_df = other["df"]
         keys_to_use = [key for key in default_keys if key in other_df.columns and key in merged_df.columns]
         if not keys_to_use:
             keys_to_use = list(set(merged_df.columns).intersection(set(other_df.columns)))
         if keys_to_use:
             st.write(f"Merging '{other['filename']}' on keys: {keys_to_use}")
                     df = pd.read_csv(uploaded_file)
                 if df is not None:
                     df.columns = [clean_column_name(col) for col in df.columns]
                     df = standardize_dataframe(df)