Spaces:
Build error
Build error
Update app.py
Browse files
app.py
CHANGED
|
@@ -23,8 +23,7 @@ def clean_column_name(col_name):
|
|
| 23 |
|
| 24 |
def clean_tin_value(val):
|
| 25 |
"""
|
| 26 |
-
Clean the TIN value by stripping whitespace and,
|
| 27 |
-
if it ends with '.0', converting it to an integer string.
|
| 28 |
"""
|
| 29 |
val_str = str(val).strip()
|
| 30 |
if val_str.endswith('.0'):
|
|
@@ -38,24 +37,22 @@ def clean_tin_value(val):
|
|
| 38 |
def standardize_dataframe(df: pd.DataFrame) -> pd.DataFrame:
|
| 39 |
"""
|
| 40 |
Standardize DataFrame column names and data types.
|
| 41 |
-
- Renames synonyms to common names (e.g., tin, salary).
|
| 42 |
-
-
|
|
|
|
| 43 |
- Combines duplicate key columns (e.g., multiple 'salary' or 'tin' columns) into one.
|
| 44 |
-
-
|
| 45 |
"""
|
| 46 |
rename_map = {}
|
| 47 |
|
| 48 |
-
# Standardize TIN-related columns
|
| 49 |
for col in df.columns:
|
| 50 |
col_lower = col.lower()
|
| 51 |
-
|
|
|
|
| 52 |
rename_map[col] = 'tin'
|
| 53 |
elif 'tin' in col_lower:
|
| 54 |
rename_map[col] = 'tin'
|
| 55 |
-
|
| 56 |
-
# Standardize salary columns
|
| 57 |
-
for col in df.columns:
|
| 58 |
-
col_lower = col.lower()
|
| 59 |
if any(keyword in col_lower for keyword in ['salary', 'wage', 'earning', 'commission', 'fee', 'payment', 'compensation']):
|
| 60 |
rename_map[col] = 'salary'
|
| 61 |
|
|
@@ -78,16 +75,15 @@ def standardize_dataframe(df: pd.DataFrame) -> pd.DataFrame:
|
|
| 78 |
if 'employee_name' not in df.columns and 'first_name' in df.columns and 'last_name' in df.columns:
|
| 79 |
df['employee_name'] = df['first_name'].astype(str).str.strip() + ' ' + df['last_name'].astype(str).str.strip()
|
| 80 |
|
| 81 |
-
# Ensure salary column is numeric (to avoid conversion errors)
|
| 82 |
if 'salary' in df.columns:
|
| 83 |
df['salary'] = pd.to_numeric(df['salary'], errors='coerce')
|
| 84 |
|
| 85 |
-
#
|
| 86 |
if 'tin' in df.columns:
|
| 87 |
-
|
| 88 |
-
df['tin'] = df['tin'].astype(str).apply(clean_tin_value)
|
| 89 |
if 'employee_name' in df.columns:
|
| 90 |
-
df['employee_name'] = df['employee_name'].astype(str).str.strip()
|
| 91 |
|
| 92 |
return df
|
| 93 |
|
|
@@ -196,7 +192,6 @@ def merge_with_master(processed_files):
|
|
| 196 |
master_df = master_file["df"]
|
| 197 |
st.write(f"Using '{master_file['filename']}' as master for merging.")
|
| 198 |
|
| 199 |
-
# Use both 'tin' and 'employee_name' if available, else fallback to common columns.
|
| 200 |
default_keys = ['tin', 'employee_name']
|
| 201 |
merged_df = master_df
|
| 202 |
|
|
|
|
| 23 |
|
| 24 |
def clean_tin_value(val):
|
| 25 |
"""
|
| 26 |
+
Clean the TIN value by stripping whitespace and, if it ends with '.0', converting it to an integer string.
|
|
|
|
| 27 |
"""
|
| 28 |
val_str = str(val).strip()
|
| 29 |
if val_str.endswith('.0'):
|
|
|
|
| 37 |
def standardize_dataframe(df: pd.DataFrame) -> pd.DataFrame:
|
| 38 |
"""
|
| 39 |
Standardize DataFrame column names and data types.
|
| 40 |
+
- Renames synonyms to common names (e.g., 'tin', 'salary').
|
| 41 |
+
- In particular, any header containing 'personal_id_of_employee' (or similar) or 'tin' is renamed to 'tin'.
|
| 42 |
+
- Creates an 'employee_name' column if missing but first_name and last_name exist.
|
| 43 |
- Combines duplicate key columns (e.g., multiple 'salary' or 'tin' columns) into one.
|
| 44 |
+
- Forces the key columns 'tin' and 'employee_name' to be strings.
|
| 45 |
"""
|
| 46 |
rename_map = {}
|
| 47 |
|
|
|
|
| 48 |
for col in df.columns:
|
| 49 |
col_lower = col.lower()
|
| 50 |
+
# Rename headers to 'tin'
|
| 51 |
+
if col_lower in ['personal id', 'personal_id', 'tax id', 'taxid'] or "personal_id_of_employee" in col_lower:
|
| 52 |
rename_map[col] = 'tin'
|
| 53 |
elif 'tin' in col_lower:
|
| 54 |
rename_map[col] = 'tin'
|
| 55 |
+
# Rename headers to 'salary'
|
|
|
|
|
|
|
|
|
|
| 56 |
if any(keyword in col_lower for keyword in ['salary', 'wage', 'earning', 'commission', 'fee', 'payment', 'compensation']):
|
| 57 |
rename_map[col] = 'salary'
|
| 58 |
|
|
|
|
| 75 |
if 'employee_name' not in df.columns and 'first_name' in df.columns and 'last_name' in df.columns:
|
| 76 |
df['employee_name'] = df['first_name'].astype(str).str.strip() + ' ' + df['last_name'].astype(str).str.strip()
|
| 77 |
|
| 78 |
+
# Ensure salary column is numeric (to avoid conversion errors later)
|
| 79 |
if 'salary' in df.columns:
|
| 80 |
df['salary'] = pd.to_numeric(df['salary'], errors='coerce')
|
| 81 |
|
| 82 |
+
# Force key columns to be strings, filling NaNs with empty strings
|
| 83 |
if 'tin' in df.columns:
|
| 84 |
+
df['tin'] = df['tin'].fillna('').astype(str).apply(clean_tin_value)
|
|
|
|
| 85 |
if 'employee_name' in df.columns:
|
| 86 |
+
df['employee_name'] = df['employee_name'].fillna('').astype(str).str.strip()
|
| 87 |
|
| 88 |
return df
|
| 89 |
|
|
|
|
| 192 |
master_df = master_file["df"]
|
| 193 |
st.write(f"Using '{master_file['filename']}' as master for merging.")
|
| 194 |
|
|
|
|
| 195 |
default_keys = ['tin', 'employee_name']
|
| 196 |
merged_df = master_df
|
| 197 |
|