rairo committed on
Commit
d6d231e
·
verified ·
1 Parent(s): 55b92d6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +20 -4
app.py CHANGED
@@ -21,13 +21,27 @@ def clean_column_name(col_name):
21
  return re.sub(r"\s+", "_", cleaned.strip().lower())
22
 
23
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
  def standardize_dataframe(df: pd.DataFrame) -> pd.DataFrame:
25
  """
26
  Standardize DataFrame column names and data types.
27
  - Renames synonyms to common names (e.g., tin, salary).
28
  - Creates an employee_name column if missing but first_name and last_name exist.
29
  - Combines duplicate key columns (e.g., multiple 'salary' or 'tin' columns) into one.
30
- - Ensures key columns 'tin' and 'employee_name' are strings.
31
  """
32
  rename_map = {}
33
 
@@ -68,11 +82,12 @@ def standardize_dataframe(df: pd.DataFrame) -> pd.DataFrame:
68
  if 'salary' in df.columns:
69
  df['salary'] = pd.to_numeric(df['salary'], errors='coerce')
70
 
71
- # Ensure key columns are strings for consistency
72
  if 'tin' in df.columns:
73
- df['tin'] = df['tin'].astype(str)
 
74
  if 'employee_name' in df.columns:
75
- df['employee_name'] = df['employee_name'].astype(str)
76
 
77
  return df
78
 
@@ -181,6 +196,7 @@ def merge_with_master(processed_files):
181
  master_df = master_file["df"]
182
  st.write(f"Using '{master_file['filename']}' as master for merging.")
183
 
 
184
  default_keys = ['tin', 'employee_name']
185
  merged_df = master_df
186
 
 
21
  return re.sub(r"\s+", "_", cleaned.strip().lower())
22
 
23
 
24
def clean_tin_value(val):
    """
    Normalize a TIN value to a clean string.

    The value is converted with ``str()`` and stripped of surrounding
    whitespace. If the result ends with ``'.0'`` (as happens when an
    integer identifier has been read as a float), it is rewritten as the
    corresponding integer string.

    Args:
        val: Any value; it is stringified before cleaning.

    Returns:
        The cleaned string. Values that end with ``'.0'`` but cannot be
        interpreted as a number are returned stripped but otherwise
        unchanged.
    """
    val_str = str(val).strip()
    if not val_str.endswith('.0'):
        return val_str

    integer_part = val_str[:-2]
    try:
        # Plain digit strings are parsed directly as integers: a float
        # round-trip silently corrupts identifiers longer than ~15 digits.
        if integer_part.isascii() and integer_part.isdigit():
            return str(int(integer_part))
        # Signed or otherwise unusual numeric forms keep the float path.
        return str(int(float(val_str)))
    except (ValueError, OverflowError):
        # Not numeric (ValueError) or overflowed to infinity
        # (OverflowError): leave the stripped text as-is.
        return val_str
36
+
37
+
38
  def standardize_dataframe(df: pd.DataFrame) -> pd.DataFrame:
39
  """
40
  Standardize DataFrame column names and data types.
41
  - Renames synonyms to common names (e.g., tin, salary).
42
  - Creates an employee_name column if missing but first_name and last_name exist.
43
  - Combines duplicate key columns (e.g., multiple 'salary' or 'tin' columns) into one.
44
+ - Cleans the key columns 'tin' and 'employee_name' for consistency.
45
  """
46
  rename_map = {}
47
 
 
82
  if 'salary' in df.columns:
83
  df['salary'] = pd.to_numeric(df['salary'], errors='coerce')
84
 
85
+ # Clean key columns:
86
  if 'tin' in df.columns:
87
+ # First, cast to string then clean individual values
88
+ df['tin'] = df['tin'].astype(str).apply(clean_tin_value)
89
  if 'employee_name' in df.columns:
90
+ df['employee_name'] = df['employee_name'].astype(str).str.strip()
91
 
92
  return df
93
 
 
196
  master_df = master_file["df"]
197
  st.write(f"Using '{master_file['filename']}' as master for merging.")
198
 
199
+ # Use both 'tin' and 'employee_name' if available, else fallback to common columns.
200
  default_keys = ['tin', 'employee_name']
201
  merged_df = master_df
202