rairo committed on
Commit
2076af3
·
verified ·
1 Parent(s): 5e8d055

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +16 -7
app.py CHANGED
@@ -27,6 +27,7 @@ def standardize_dataframe(df: pd.DataFrame) -> pd.DataFrame:
27
  - Renames synonyms to common names (e.g., tin, salary).
28
  - Creates an employee_name column if missing but first_name and last_name exist.
29
  - Converts the salary column to numeric.
 
30
  """
31
  rename_map = {}
32
 
@@ -38,7 +39,7 @@ def standardize_dataframe(df: pd.DataFrame) -> pd.DataFrame:
38
  elif 'tin' in col_lower:
39
  rename_map[col] = 'tin'
40
 
41
- # Standardize salary columns (e.g., current_salary_wages_fees_commissions_etc_regular_earnings)
42
  for col in df.columns:
43
  col_lower = col.lower()
44
  if any(keyword in col_lower for keyword in ['salary', 'wage', 'earning', 'commission', 'fee', 'payment', 'compensation']):
@@ -47,11 +48,24 @@ def standardize_dataframe(df: pd.DataFrame) -> pd.DataFrame:
47
  if rename_map:
48
  df = df.rename(columns=rename_map)
49
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50
  # Create employee_name if not present but first_name and last_name exist
51
  if 'employee_name' not in df.columns and 'first_name' in df.columns and 'last_name' in df.columns:
52
  df['employee_name'] = df['first_name'].astype(str).str.strip() + ' ' + df['last_name'].astype(str).str.strip()
53
 
54
- # Ensure salary column is numeric (to avoid pyarrow conversion errors)
55
  if 'salary' in df.columns:
56
  df['salary'] = pd.to_numeric(df['salary'], errors='coerce')
57
 
@@ -148,7 +162,6 @@ def merge_with_master(processed_files):
148
  master_file = None
149
  other_files = []
150
 
151
- # Identify the master file by checking for 'earnings' in the filename
152
  for file_info in processed_files:
153
  if "earnings" in file_info["filename"].lower():
154
  master_file = file_info
@@ -163,16 +176,13 @@ def merge_with_master(processed_files):
163
  master_df = master_file["df"]
164
  st.write(f"Using '{master_file['filename']}' as master for merging.")
165
 
166
- # Define default key columns for merging
167
  default_keys = ['tin', 'employee_name']
168
  merged_df = master_df
169
 
170
  for other in other_files:
171
  other_df = other["df"]
172
- # Try to use default keys if they exist in both
173
  keys_to_use = [key for key in default_keys if key in other_df.columns and key in merged_df.columns]
174
  if not keys_to_use:
175
- # Fallback: use intersection of columns if default keys aren't found
176
  keys_to_use = list(set(merged_df.columns).intersection(set(other_df.columns)))
177
  if keys_to_use:
178
  st.write(f"Merging '{other['filename']}' on keys: {keys_to_use}")
@@ -210,7 +220,6 @@ def main():
210
  df = pd.read_csv(uploaded_file)
211
 
212
  if df is not None:
213
- # Clean and standardize column names and data types
214
  df.columns = [clean_column_name(col) for col in df.columns]
215
  df = standardize_dataframe(df)
216
 
 
27
  - Renames synonyms to common names (e.g., tin, salary).
28
  - Creates an employee_name column if missing but first_name and last_name exist.
29
  - Converts the salary column to numeric.
30
+ - Combines duplicate key columns (e.g., multiple 'salary' or 'tin' columns) into one.
31
  """
32
  rename_map = {}
33
 
 
39
  elif 'tin' in col_lower:
40
  rename_map[col] = 'tin'
41
 
42
+ # Standardize salary columns
43
  for col in df.columns:
44
  col_lower = col.lower()
45
  if any(keyword in col_lower for keyword in ['salary', 'wage', 'earning', 'commission', 'fee', 'payment', 'compensation']):
 
48
  if rename_map:
49
  df = df.rename(columns=rename_map)
50
 
51
+ # Combine duplicate columns for 'salary'
52
+ if 'salary' in df.columns and list(df.columns).count('salary') > 1:
53
+ salary_cols = [col for col in df.columns if col == 'salary']
54
+ # Use backfill across the duplicate columns and take the first non-null value
55
+ df['salary'] = df[salary_cols].bfill(axis=1).iloc[:, 0]
56
+ df = df.loc[:, ~df.columns.duplicated()]
57
+
58
+ # Combine duplicate columns for 'tin'
59
+ if 'tin' in df.columns and list(df.columns).count('tin') > 1:
60
+ tin_cols = [col for col in df.columns if col == 'tin']
61
+ df['tin'] = df[tin_cols].bfill(axis=1).iloc[:, 0]
62
+ df = df.loc[:, ~df.columns.duplicated()]
63
+
64
  # Create employee_name if not present but first_name and last_name exist
65
  if 'employee_name' not in df.columns and 'first_name' in df.columns and 'last_name' in df.columns:
66
  df['employee_name'] = df['first_name'].astype(str).str.strip() + ' ' + df['last_name'].astype(str).str.strip()
67
 
68
+ # Ensure salary column is numeric (to avoid conversion errors)
69
  if 'salary' in df.columns:
70
  df['salary'] = pd.to_numeric(df['salary'], errors='coerce')
71
 
 
162
  master_file = None
163
  other_files = []
164
 
 
165
  for file_info in processed_files:
166
  if "earnings" in file_info["filename"].lower():
167
  master_file = file_info
 
176
  master_df = master_file["df"]
177
  st.write(f"Using '{master_file['filename']}' as master for merging.")
178
 
 
179
  default_keys = ['tin', 'employee_name']
180
  merged_df = master_df
181
 
182
  for other in other_files:
183
  other_df = other["df"]
 
184
  keys_to_use = [key for key in default_keys if key in other_df.columns and key in merged_df.columns]
185
  if not keys_to_use:
 
186
  keys_to_use = list(set(merged_df.columns).intersection(set(other_df.columns)))
187
  if keys_to_use:
188
  st.write(f"Merging '{other['filename']}' on keys: {keys_to_use}")
 
220
  df = pd.read_csv(uploaded_file)
221
 
222
  if df is not None:
 
223
  df.columns = [clean_column_name(col) for col in df.columns]
224
  df = standardize_dataframe(df)
225