Spaces:
Build error
Build error
Update app.py
Browse files
app.py
CHANGED
|
@@ -39,33 +39,35 @@ def standardize_dataframe(df: pd.DataFrame) -> pd.DataFrame:
|
|
| 39 |
"""
|
| 40 |
Standardize DataFrame column names and data types:
|
| 41 |
- Drops any middle name columns.
|
| 42 |
-
- Cleans all column names (e.g., "Employee Name"
|
| 43 |
-
- Renames synonyms to common names (e.g., 'tin'
|
|
|
|
| 44 |
- Creates an 'employee_name' column if missing but first_name and last_name exist.
|
| 45 |
- Combines duplicate key columns into one.
|
| 46 |
-
- Forces
|
| 47 |
"""
|
| 48 |
# Drop any column that appears to be a middle name
|
| 49 |
middle_name_cols = [col for col in df.columns if 'middle_name' in col.lower()]
|
| 50 |
if middle_name_cols:
|
| 51 |
df = df.drop(columns=middle_name_cols)
|
| 52 |
|
| 53 |
-
# Clean all column names
|
| 54 |
df.columns = [clean_column_name(col) for col in df.columns]
|
| 55 |
|
| 56 |
-
#
|
|
|
|
| 57 |
rename_map = {}
|
| 58 |
for col in df.columns:
|
| 59 |
if col in ['personal id', 'personal_id', 'tax id', 'taxid'] or "personal_id_of_employee" in col:
|
| 60 |
rename_map[col] = 'tin'
|
| 61 |
-
elif 'tin' in col:
|
| 62 |
rename_map[col] = 'tin'
|
| 63 |
if any(keyword in col for keyword in ['salary', 'wage', 'earning', 'commission', 'fee', 'payment', 'compensation']):
|
| 64 |
rename_map[col] = 'salary'
|
| 65 |
if rename_map:
|
| 66 |
df = df.rename(columns=rename_map)
|
| 67 |
|
| 68 |
-
# Combine duplicate columns
|
| 69 |
if 'salary' in df.columns and list(df.columns).count('salary') > 1:
|
| 70 |
salary_cols = [col for col in df.columns if col == 'salary']
|
| 71 |
df['salary'] = df[salary_cols].bfill(axis=1).iloc[:, 0]
|
|
@@ -75,11 +77,11 @@ def standardize_dataframe(df: pd.DataFrame) -> pd.DataFrame:
|
|
| 75 |
df['tin'] = df[tin_cols].bfill(axis=1).iloc[:, 0]
|
| 76 |
df = df.loc[:, ~df.columns.duplicated()]
|
| 77 |
|
| 78 |
-
# If employee_name is missing
|
| 79 |
if 'employee_name' not in df.columns and 'first_name' in df.columns and 'last_name' in df.columns:
|
| 80 |
df['employee_name'] = df['first_name'].astype(str).str.strip() + ' ' + df['last_name'].astype(str).str.strip()
|
| 81 |
|
| 82 |
-
# Ensure key columns are of the
|
| 83 |
if 'salary' in df.columns:
|
| 84 |
df['salary'] = pd.to_numeric(df['salary'], errors='coerce')
|
| 85 |
if 'tin' in df.columns:
|
|
@@ -179,15 +181,16 @@ def read_excel_file(file) -> pd.DataFrame:
|
|
| 179 |
def merge_with_master(processed_files):
|
| 180 |
"""
|
| 181 |
Merge multiple DataFrames using a two-step process:
|
| 182 |
-
1. Use the earnings file as
|
| 183 |
-
2. Merge template info onto earnings
|
| 184 |
-
|
|
|
|
| 185 |
"""
|
| 186 |
earnings_file = None
|
| 187 |
paye_file = None
|
| 188 |
template_file = None
|
| 189 |
|
| 190 |
-
# Identify files based on filename keywords
|
| 191 |
for file_info in processed_files:
|
| 192 |
lower_filename = file_info["filename"].lower()
|
| 193 |
if "earnings" in lower_filename:
|
|
@@ -196,16 +199,17 @@ def merge_with_master(processed_files):
|
|
| 196 |
paye_file = file_info
|
| 197 |
elif "template" in lower_filename:
|
| 198 |
template_file = file_info
|
|
|
|
| 199 |
if not earnings_file:
|
| 200 |
st.warning("No earnings file found as master. Using the first file as master.")
|
| 201 |
earnings_file = processed_files[0]
|
| 202 |
|
| 203 |
-
# Use the earnings DataFrame as the master
|
| 204 |
earnings_df = earnings_file["df"]
|
| 205 |
-
# Drop the inaccurate 'tin' column from earnings
|
| 206 |
if 'tin' in earnings_df.columns:
|
| 207 |
earnings_df = earnings_df.drop(columns=['tin'])
|
| 208 |
-
# Double-check removal of any middle_name column (should
|
| 209 |
if 'middle_name' in earnings_df.columns:
|
| 210 |
earnings_df = earnings_df.drop(columns=['middle_name'])
|
| 211 |
|
|
@@ -215,10 +219,10 @@ def merge_with_master(processed_files):
|
|
| 215 |
if template_file is not None:
|
| 216 |
st.write(f"Merging template info from '{template_file['filename']}' using key 'employee_name'.")
|
| 217 |
template_df = template_file["df"]
|
| 218 |
-
# Drop any middle_name column from the template file
|
| 219 |
if 'middle_name' in template_df.columns:
|
| 220 |
template_df = template_df.drop(columns=['middle_name'])
|
| 221 |
-
# Ensure template has an 'employee_name' column (
|
| 222 |
if 'employee_name' not in template_df.columns and 'first_name' in template_df.columns and 'last_name' in template_df.columns:
|
| 223 |
template_df['employee_name'] = template_df['first_name'].astype(str).str.strip() + ' ' + template_df['last_name'].astype(str).str.strip()
|
| 224 |
if 'employee_name' in merged_df.columns and 'employee_name' in template_df.columns:
|
|
@@ -226,16 +230,21 @@ def merge_with_master(processed_files):
|
|
| 226 |
else:
|
| 227 |
st.warning("Column 'employee_name' missing in either the earnings or template file. Skipping template merge.")
|
| 228 |
else:
|
| 229 |
-
st.warning("No template file detected.")
|
| 230 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 231 |
# Merge PAYE figures onto the merged DataFrame using 'tin'
|
| 232 |
if paye_file is not None:
|
| 233 |
st.write(f"Merging PAYE figures from '{paye_file['filename']}' using key 'tin'.")
|
| 234 |
paye_df = paye_file["df"]
|
| 235 |
-
if 'tin' in
|
| 236 |
merged_df = merged_df.merge(paye_df, on='tin', how='left', suffixes=('', '_paye'))
|
| 237 |
else:
|
| 238 |
-
st.warning("Column 'tin' missing in
|
| 239 |
else:
|
| 240 |
st.warning("No PAYE file detected.")
|
| 241 |
|
|
@@ -268,7 +277,7 @@ def main():
|
|
| 268 |
if df.empty:
|
| 269 |
st.warning(f"DataFrame from '{uploaded_file.name}' is empty. Please check the file.")
|
| 270 |
continue
|
| 271 |
-
# Standardize
|
| 272 |
df = standardize_dataframe(df)
|
| 273 |
st.write("Initial Preview:")
|
| 274 |
st.dataframe(df.head())
|
|
@@ -277,7 +286,7 @@ def main():
|
|
| 277 |
if analysis:
|
| 278 |
st.write("Column Analysis:")
|
| 279 |
st.json(analysis)
|
| 280 |
-
# Apply suggested renames
|
| 281 |
if 'suggested_renames' in analysis:
|
| 282 |
df = df.rename(columns=analysis['suggested_renames'])
|
| 283 |
processed_files.append(
|
|
|
|
| 39 |
"""
|
| 40 |
Standardize DataFrame column names and data types:
|
| 41 |
- Drops any middle name columns.
|
| 42 |
+
- Cleans all column names (e.g., "Employee Name" becomes "employee_name").
|
| 43 |
+
- Renames synonyms to common names (e.g., mapping TIN-related columns to 'tin'
|
| 44 |
+
and salary-related columns to 'salary').
|
| 45 |
- Creates an 'employee_name' column if missing but first_name and last_name exist.
|
| 46 |
- Combines duplicate key columns into one.
|
| 47 |
+
- Forces key columns (tin and employee_name) to be strings.
|
| 48 |
"""
|
| 49 |
# Drop any column that appears to be a middle name
|
| 50 |
middle_name_cols = [col for col in df.columns if 'middle_name' in col.lower()]
|
| 51 |
if middle_name_cols:
|
| 52 |
df = df.drop(columns=middle_name_cols)
|
| 53 |
|
| 54 |
+
# Clean all column names so that "Employee Name" becomes "employee_name", etc.
|
| 55 |
df.columns = [clean_column_name(col) for col in df.columns]
|
| 56 |
|
| 57 |
+
# Build a rename map for TIN and salary synonyms.
|
| 58 |
+
# Note: This will capture PAYE's "tin_or_personal_id_of_employee" too.
|
| 59 |
rename_map = {}
|
| 60 |
for col in df.columns:
|
| 61 |
if col in ['personal id', 'personal_id', 'tax id', 'taxid'] or "personal_id_of_employee" in col:
|
| 62 |
rename_map[col] = 'tin'
|
| 63 |
+
elif 'tin' in col and 'tin' not in rename_map.get(col, ''):
|
| 64 |
rename_map[col] = 'tin'
|
| 65 |
if any(keyword in col for keyword in ['salary', 'wage', 'earning', 'commission', 'fee', 'payment', 'compensation']):
|
| 66 |
rename_map[col] = 'salary'
|
| 67 |
if rename_map:
|
| 68 |
df = df.rename(columns=rename_map)
|
| 69 |
|
| 70 |
+
# Combine duplicate columns for salary and tin if needed
|
| 71 |
if 'salary' in df.columns and list(df.columns).count('salary') > 1:
|
| 72 |
salary_cols = [col for col in df.columns if col == 'salary']
|
| 73 |
df['salary'] = df[salary_cols].bfill(axis=1).iloc[:, 0]
|
|
|
|
| 77 |
df['tin'] = df[tin_cols].bfill(axis=1).iloc[:, 0]
|
| 78 |
df = df.loc[:, ~df.columns.duplicated()]
|
| 79 |
|
| 80 |
+
# If employee_name is missing but first_name and last_name exist, create it.
|
| 81 |
if 'employee_name' not in df.columns and 'first_name' in df.columns and 'last_name' in df.columns:
|
| 82 |
df['employee_name'] = df['first_name'].astype(str).str.strip() + ' ' + df['last_name'].astype(str).str.strip()
|
| 83 |
|
| 84 |
+
# Ensure key columns are of the proper type.
|
| 85 |
if 'salary' in df.columns:
|
| 86 |
df['salary'] = pd.to_numeric(df['salary'], errors='coerce')
|
| 87 |
if 'tin' in df.columns:
|
|
|
|
| 181 |
def merge_with_master(processed_files):
|
| 182 |
"""
|
| 183 |
Merge multiple DataFrames using a two-step process:
|
| 184 |
+
1. Use the earnings file as master and drop its inaccurate 'tin' column.
|
| 185 |
+
2. Merge template info onto earnings using 'employee_name' (the key provided by "Employee Name").
|
| 186 |
+
The trusted 'tin' comes from the template file.
|
| 187 |
+
3. Merge the combined earnings–template DataFrame with the PAYE file using 'tin'.
|
| 188 |
"""
|
| 189 |
earnings_file = None
|
| 190 |
paye_file = None
|
| 191 |
template_file = None
|
| 192 |
|
| 193 |
+
# Identify files based on filename keywords.
|
| 194 |
for file_info in processed_files:
|
| 195 |
lower_filename = file_info["filename"].lower()
|
| 196 |
if "earnings" in lower_filename:
|
|
|
|
| 199 |
paye_file = file_info
|
| 200 |
elif "template" in lower_filename:
|
| 201 |
template_file = file_info
|
| 202 |
+
|
| 203 |
if not earnings_file:
|
| 204 |
st.warning("No earnings file found as master. Using the first file as master.")
|
| 205 |
earnings_file = processed_files[0]
|
| 206 |
|
| 207 |
+
# Use the earnings DataFrame as the master.
|
| 208 |
earnings_df = earnings_file["df"]
|
| 209 |
+
# Drop the inaccurate 'tin' column from earnings if it exists.
|
| 210 |
if 'tin' in earnings_df.columns:
|
| 211 |
earnings_df = earnings_df.drop(columns=['tin'])
|
| 212 |
+
# Double-check removal of any middle_name column (should be done in standardization).
|
| 213 |
if 'middle_name' in earnings_df.columns:
|
| 214 |
earnings_df = earnings_df.drop(columns=['middle_name'])
|
| 215 |
|
|
|
|
| 219 |
if template_file is not None:
|
| 220 |
st.write(f"Merging template info from '{template_file['filename']}' using key 'employee_name'.")
|
| 221 |
template_df = template_file["df"]
|
| 222 |
+
# Drop any middle_name column from the template file.
|
| 223 |
if 'middle_name' in template_df.columns:
|
| 224 |
template_df = template_df.drop(columns=['middle_name'])
|
| 225 |
+
# Ensure template has an 'employee_name' column (construct if necessary).
|
| 226 |
if 'employee_name' not in template_df.columns and 'first_name' in template_df.columns and 'last_name' in template_df.columns:
|
| 227 |
template_df['employee_name'] = template_df['first_name'].astype(str).str.strip() + ' ' + template_df['last_name'].astype(str).str.strip()
|
| 228 |
if 'employee_name' in merged_df.columns and 'employee_name' in template_df.columns:
|
|
|
|
| 230 |
else:
|
| 231 |
st.warning("Column 'employee_name' missing in either the earnings or template file. Skipping template merge.")
|
| 232 |
else:
|
| 233 |
+
st.warning("No template file detected. Cannot proceed without a trusted TIN from the template.")
|
| 234 |
|
| 235 |
+
# After merging, check that a trusted 'tin' is present from the template.
|
| 236 |
+
if 'tin' not in merged_df.columns:
|
| 237 |
+
st.error("No trusted 'tin' column found in the merged earnings-template data. Aborting further merge.")
|
| 238 |
+
return merged_df
|
| 239 |
+
|
| 240 |
# Merge PAYE figures onto the merged DataFrame using 'tin'
|
| 241 |
if paye_file is not None:
|
| 242 |
st.write(f"Merging PAYE figures from '{paye_file['filename']}' using key 'tin'.")
|
| 243 |
paye_df = paye_file["df"]
|
| 244 |
+
if 'tin' in paye_df.columns:
|
| 245 |
merged_df = merged_df.merge(paye_df, on='tin', how='left', suffixes=('', '_paye'))
|
| 246 |
else:
|
| 247 |
+
st.warning("Column 'tin' missing in the PAYE file. Skipping PAYE merge.")
|
| 248 |
else:
|
| 249 |
st.warning("No PAYE file detected.")
|
| 250 |
|
|
|
|
| 277 |
if df.empty:
|
| 278 |
st.warning(f"DataFrame from '{uploaded_file.name}' is empty. Please check the file.")
|
| 279 |
continue
|
| 280 |
+
# Standardize columns and key identifiers.
|
| 281 |
df = standardize_dataframe(df)
|
| 282 |
st.write("Initial Preview:")
|
| 283 |
st.dataframe(df.head())
|
|
|
|
| 286 |
if analysis:
|
| 287 |
st.write("Column Analysis:")
|
| 288 |
st.json(analysis)
|
| 289 |
+
# Apply any suggested renames from the analysis.
|
| 290 |
if 'suggested_renames' in analysis:
|
| 291 |
df = df.rename(columns=analysis['suggested_renames'])
|
| 292 |
processed_files.append(
|