rairo committed on
Commit
d05ef69
·
verified ·
1 Parent(s): d6d231e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +12 -17
app.py CHANGED
@@ -23,8 +23,7 @@ def clean_column_name(col_name):
23
 
24
  def clean_tin_value(val):
25
  """
26
- Clean the TIN value by stripping whitespace and,
27
- if it ends with '.0', converting it to an integer string.
28
  """
29
  val_str = str(val).strip()
30
  if val_str.endswith('.0'):
@@ -38,24 +37,22 @@ def clean_tin_value(val):
38
  def standardize_dataframe(df: pd.DataFrame) -> pd.DataFrame:
39
  """
40
  Standardize DataFrame column names and data types.
41
- - Renames synonyms to common names (e.g., tin, salary).
42
- - Creates an employee_name column if missing but first_name and last_name exist.
 
43
  - Combines duplicate key columns (e.g., multiple 'salary' or 'tin' columns) into one.
44
- - Cleans the key columns 'tin' and 'employee_name' for consistency.
45
  """
46
  rename_map = {}
47
 
48
- # Standardize TIN-related columns
49
  for col in df.columns:
50
  col_lower = col.lower()
51
- if col_lower in ['personal id', 'personal_id', 'tax id', 'taxid']:
 
52
  rename_map[col] = 'tin'
53
  elif 'tin' in col_lower:
54
  rename_map[col] = 'tin'
55
-
56
- # Standardize salary columns
57
- for col in df.columns:
58
- col_lower = col.lower()
59
  if any(keyword in col_lower for keyword in ['salary', 'wage', 'earning', 'commission', 'fee', 'payment', 'compensation']):
60
  rename_map[col] = 'salary'
61
 
@@ -78,16 +75,15 @@ def standardize_dataframe(df: pd.DataFrame) -> pd.DataFrame:
78
  if 'employee_name' not in df.columns and 'first_name' in df.columns and 'last_name' in df.columns:
79
  df['employee_name'] = df['first_name'].astype(str).str.strip() + ' ' + df['last_name'].astype(str).str.strip()
80
 
81
- # Ensure salary column is numeric (to avoid conversion errors)
82
  if 'salary' in df.columns:
83
  df['salary'] = pd.to_numeric(df['salary'], errors='coerce')
84
 
85
- # Clean key columns:
86
  if 'tin' in df.columns:
87
- # First, cast to string then clean individual values
88
- df['tin'] = df['tin'].astype(str).apply(clean_tin_value)
89
  if 'employee_name' in df.columns:
90
- df['employee_name'] = df['employee_name'].astype(str).str.strip()
91
 
92
  return df
93
 
@@ -196,7 +192,6 @@ def merge_with_master(processed_files):
196
  master_df = master_file["df"]
197
  st.write(f"Using '{master_file['filename']}' as master for merging.")
198
 
199
- # Use both 'tin' and 'employee_name' if available, else fallback to common columns.
200
  default_keys = ['tin', 'employee_name']
201
  merged_df = master_df
202
 
 
23
 
24
  def clean_tin_value(val):
25
  """
26
+ Clean the TIN value by stripping whitespace and, if it ends with '.0', converting it to an integer string.
 
27
  """
28
  val_str = str(val).strip()
29
  if val_str.endswith('.0'):
 
37
  def standardize_dataframe(df: pd.DataFrame) -> pd.DataFrame:
38
  """
39
  Standardize DataFrame column names and data types.
40
+ - Renames synonyms to common names (e.g., 'tin', 'salary').
41
+ - In particular, any header containing 'personal_id_of_employee' (or similar) or 'tin' is renamed to 'tin'.
42
+ - Creates an 'employee_name' column if missing but first_name and last_name exist.
43
  - Combines duplicate key columns (e.g., multiple 'salary' or 'tin' columns) into one.
44
+ - Forces the key columns 'tin' and 'employee_name' to be strings.
45
  """
46
  rename_map = {}
47
 
 
48
  for col in df.columns:
49
  col_lower = col.lower()
50
+ # Rename headers to 'tin'
51
+ if col_lower in ['personal id', 'personal_id', 'tax id', 'taxid'] or "personal_id_of_employee" in col_lower:
52
  rename_map[col] = 'tin'
53
  elif 'tin' in col_lower:
54
  rename_map[col] = 'tin'
55
+ # Rename headers to 'salary'
 
 
 
56
  if any(keyword in col_lower for keyword in ['salary', 'wage', 'earning', 'commission', 'fee', 'payment', 'compensation']):
57
  rename_map[col] = 'salary'
58
 
 
75
  if 'employee_name' not in df.columns and 'first_name' in df.columns and 'last_name' in df.columns:
76
  df['employee_name'] = df['first_name'].astype(str).str.strip() + ' ' + df['last_name'].astype(str).str.strip()
77
 
78
+ # Ensure salary column is numeric (to avoid conversion errors later)
79
  if 'salary' in df.columns:
80
  df['salary'] = pd.to_numeric(df['salary'], errors='coerce')
81
 
82
+ # Force key columns to be strings, filling NaNs with empty strings
83
  if 'tin' in df.columns:
84
+ df['tin'] = df['tin'].fillna('').astype(str).apply(clean_tin_value)
 
85
  if 'employee_name' in df.columns:
86
+ df['employee_name'] = df['employee_name'].fillna('').astype(str).str.strip()
87
 
88
  return df
89
 
 
192
  master_df = master_file["df"]
193
  st.write(f"Using '{master_file['filename']}' as master for merging.")
194
 
 
195
  default_keys = ['tin', 'employee_name']
196
  merged_df = master_df
197