rairo committed on
Commit
55b92d6
·
verified ·
1 Parent(s): 2076af3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +7 -2
app.py CHANGED
@@ -26,8 +26,8 @@ def standardize_dataframe(df: pd.DataFrame) -> pd.DataFrame:
26
  Standardize DataFrame column names and data types.
27
  - Renames synonyms to common names (e.g., tin, salary).
28
  - Creates an employee_name column if missing but first_name and last_name exist.
29
- - Converts the salary column to numeric.
30
  - Combines duplicate key columns (e.g., multiple 'salary' or 'tin' columns) into one.
 
31
  """
32
  rename_map = {}
33
 
@@ -51,7 +51,6 @@ def standardize_dataframe(df: pd.DataFrame) -> pd.DataFrame:
51
  # Combine duplicate columns for 'salary'
52
  if 'salary' in df.columns and list(df.columns).count('salary') > 1:
53
  salary_cols = [col for col in df.columns if col == 'salary']
54
- # Use backfill across the duplicate columns and take the first non-null value
55
  df['salary'] = df[salary_cols].bfill(axis=1).iloc[:, 0]
56
  df = df.loc[:, ~df.columns.duplicated()]
57
 
@@ -69,6 +68,12 @@ def standardize_dataframe(df: pd.DataFrame) -> pd.DataFrame:
69
  if 'salary' in df.columns:
70
  df['salary'] = pd.to_numeric(df['salary'], errors='coerce')
71
 
 
 
 
 
 
 
72
  return df
73
 
74
 
 
26
  Standardize DataFrame column names and data types.
27
  - Renames synonyms to common names (e.g., tin, salary).
28
  - Creates an employee_name column if missing but first_name and last_name exist.
 
29
  - Combines duplicate key columns (e.g., multiple 'salary' or 'tin' columns) into one.
30
+ - Ensures key columns 'tin' and 'employee_name' are strings.
31
  """
32
  rename_map = {}
33
 
 
51
  # Combine duplicate columns for 'salary'
52
  if 'salary' in df.columns and list(df.columns).count('salary') > 1:
53
  salary_cols = [col for col in df.columns if col == 'salary']
 
54
  df['salary'] = df[salary_cols].bfill(axis=1).iloc[:, 0]
55
  df = df.loc[:, ~df.columns.duplicated()]
56
 
 
68
  if 'salary' in df.columns:
69
  df['salary'] = pd.to_numeric(df['salary'], errors='coerce')
70
 
71
+ # Ensure key columns are strings for consistency
72
+ if 'tin' in df.columns:
73
+ df['tin'] = df['tin'].astype(str)
74
+ if 'employee_name' in df.columns:
75
+ df['employee_name'] = df['employee_name'].astype(str)
76
+
77
  return df
78
 
79