rairo committed on
Commit
fc7b3ea
·
verified ·
1 Parent(s): 9c17313

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +53 -28
app.py CHANGED
@@ -13,7 +13,10 @@ genai.configure(api_key=os.environ["GOOGLE_API_KEY"])
13
  model = genai.GenerativeModel('gemini-2.0-flash-thinking-exp')
14
 
15
  def clean_column_name(col_name):
16
- """Clean column names to be compatible with Arrow."""
 
 
 
17
  if not isinstance(col_name, str):
18
  return str(col_name)
19
  cleaned = re.sub(r"[^\w\s]", " ", col_name)
@@ -21,7 +24,8 @@ def clean_column_name(col_name):
21
 
22
  def clean_tin_value(val):
23
  """
24
- Clean the TIN value by stripping whitespace and, if it ends with '.0', converting it to an integer string.
 
25
  """
26
  val_str = str(val).strip()
27
  if val_str.endswith('.0'):
@@ -33,29 +37,35 @@ def clean_tin_value(val):
33
 
34
  def standardize_dataframe(df: pd.DataFrame) -> pd.DataFrame:
35
  """
36
- Standardize DataFrame column names and data types.
 
 
37
  - Renames synonyms to common names (e.g., 'tin', 'salary').
38
  - Creates an 'employee_name' column if missing but first_name and last_name exist.
39
- - Combines duplicate key columns (e.g., multiple 'salary' or 'tin' columns) into one.
40
  - Forces the key columns 'tin' and 'employee_name' to be strings.
41
- - Drops any middle name columns.
42
  """
43
  # Drop any column that appears to be a middle name
44
  middle_name_cols = [col for col in df.columns if 'middle_name' in col.lower()]
45
  if middle_name_cols:
46
  df = df.drop(columns=middle_name_cols)
47
 
 
 
 
 
48
  rename_map = {}
49
  for col in df.columns:
50
- col_lower = col.lower()
51
- if col_lower in ['personal id', 'personal_id', 'tax id', 'taxid'] or "personal_id_of_employee" in col_lower:
52
  rename_map[col] = 'tin'
53
- elif 'tin' in col_lower:
54
  rename_map[col] = 'tin'
55
- if any(keyword in col_lower for keyword in ['salary', 'wage', 'earning', 'commission', 'fee', 'payment', 'compensation']):
56
  rename_map[col] = 'salary'
57
  if rename_map:
58
  df = df.rename(columns=rename_map)
 
 
59
  if 'salary' in df.columns and list(df.columns).count('salary') > 1:
60
  salary_cols = [col for col in df.columns if col == 'salary']
61
  df['salary'] = df[salary_cols].bfill(axis=1).iloc[:, 0]
@@ -64,18 +74,27 @@ def standardize_dataframe(df: pd.DataFrame) -> pd.DataFrame:
64
  tin_cols = [col for col in df.columns if col == 'tin']
65
  df['tin'] = df[tin_cols].bfill(axis=1).iloc[:, 0]
66
  df = df.loc[:, ~df.columns.duplicated()]
 
 
67
  if 'employee_name' not in df.columns and 'first_name' in df.columns and 'last_name' in df.columns:
68
  df['employee_name'] = df['first_name'].astype(str).str.strip() + ' ' + df['last_name'].astype(str).str.strip()
 
 
69
  if 'salary' in df.columns:
70
  df['salary'] = pd.to_numeric(df['salary'], errors='coerce')
71
  if 'tin' in df.columns:
72
  df['tin'] = df['tin'].fillna('').astype(str).apply(clean_tin_value)
73
  if 'employee_name' in df.columns:
74
  df['employee_name'] = df['employee_name'].fillna('').astype(str).str.strip()
 
75
  return df
76
 
77
  def analyze_columns(df: pd.DataFrame, filename: str) -> dict:
78
- """Analyze DataFrame columns using Gemini AI with an updated prompt."""
 
 
 
 
79
  try:
80
  display_df = df.head(5).copy()
81
  for col in display_df.columns:
@@ -144,7 +163,10 @@ def analyze_columns(df: pd.DataFrame, filename: str) -> dict:
144
  }
145
 
146
  def read_excel_file(file) -> pd.DataFrame:
147
- """Read Excel file with improved error handling."""
 
 
 
148
  try:
149
  return pd.read_excel(file, engine="openpyxl")
150
  except Exception as e1:
@@ -158,12 +180,14 @@ def merge_with_master(processed_files):
158
  """
159
  Merge multiple DataFrames using a two-step process:
160
  1. Use the earnings file as the master. Drop its inaccurate 'tin' column.
161
- 2. Merge template info onto earnings via 'employee_name' (constructed from first and last names).
162
  3. Then merge the resulting DataFrame with the PAYE file using the (correct) 'tin' key.
163
  """
164
  earnings_file = None
165
  paye_file = None
166
  template_file = None
 
 
167
  for file_info in processed_files:
168
  lower_filename = file_info["filename"].lower()
169
  if "earnings" in lower_filename:
@@ -176,20 +200,20 @@ def merge_with_master(processed_files):
176
  st.warning("No earnings file found as master. Using the first file as master.")
177
  earnings_file = processed_files[0]
178
 
179
- # Start with the earnings DataFrame as master
180
  earnings_df = earnings_file["df"]
181
  # Drop the inaccurate 'tin' column from earnings, if present
182
  if 'tin' in earnings_df.columns:
183
  earnings_df = earnings_df.drop(columns=['tin'])
184
- # Ensure any middle_name column is dropped (already handled in standardization, but double-check)
185
  if 'middle_name' in earnings_df.columns:
186
  earnings_df = earnings_df.drop(columns=['middle_name'])
187
 
188
  merged_df = earnings_df.copy()
189
 
190
- # Merge template info onto earnings using 'employee_name' as key
191
  if template_file is not None:
192
- st.write(f"Merging template info from '{template_file['filename']}' onto the earnings sheet using key 'employee_name'.")
193
  template_df = template_file["df"]
194
  # Drop any middle_name column from the template file
195
  if 'middle_name' in template_df.columns:
@@ -206,7 +230,7 @@ def merge_with_master(processed_files):
206
 
207
  # Merge PAYE figures onto the merged DataFrame using 'tin'
208
  if paye_file is not None:
209
- st.write(f"Merging PAYE figures from '{paye_file['filename']}' onto the merged sheet using key 'tin'.")
210
  paye_df = paye_file["df"]
211
  if 'tin' in merged_df.columns and 'tin' in paye_df.columns:
212
  merged_df = merged_df.merge(paye_df, on='tin', how='left', suffixes=('', '_paye'))
@@ -218,7 +242,10 @@ def merge_with_master(processed_files):
218
  return merged_df
219
 
220
def safe_display_df(df: pd.DataFrame) -> pd.DataFrame:
    """Ensure DataFrame is safe for display in Streamlit.

    Every cell is stringified so Arrow serialization cannot choke on mixed
    dtypes, and the literal "nan"/"None" placeholders produced by that
    stringification are blanked out for cleaner display.
    """
    as_text = df.astype(str)
    return as_text.replace({"nan": "", "None": ""})
223
 
224
  def main():
@@ -239,9 +266,9 @@ def main():
239
  df = pd.read_csv(uploaded_file)
240
  if df is not None:
241
  if df.empty:
242
- st.warning(f"DataFrame from '{uploaded_file.name}' is empty after reading. Please check the file.")
243
  continue
244
- df.columns = [clean_column_name(col) for col in df.columns]
245
  df = standardize_dataframe(df)
246
  st.write("Initial Preview:")
247
  st.dataframe(df.head())
@@ -250,7 +277,7 @@ def main():
250
  if analysis:
251
  st.write("Column Analysis:")
252
  st.json(analysis)
253
- # Apply suggested renames if provided
254
  if 'suggested_renames' in analysis:
255
  df = df.rename(columns=analysis['suggested_renames'])
256
  processed_files.append(
@@ -281,13 +308,11 @@ def main():
281
  st.write(f"Total rows: {len(merged_df)}")
282
  st.write(f"Total columns: {len(merged_df.columns)}")
283
  st.write("### Data Quality Metrics")
284
- missing_df = pd.DataFrame(
285
- {
286
- "Column": merged_df.columns,
287
- "Missing Values": merged_df.isnull().sum().values,
288
- "Missing Percentage": (merged_df.isnull().sum().values / len(merged_df) * 100).round(2),
289
- }
290
- )
291
  st.dataframe(missing_df)
292
  duplicates = merged_df.duplicated().sum()
293
  st.write(f"Number of duplicate rows: {duplicates}")
 
13
  model = genai.GenerativeModel('gemini-2.0-flash-thinking-exp')
14
 
15
  def clean_column_name(col_name):
16
+ """
17
+ Clean column names to be compatible with Arrow.
18
+ Converts to lowercase and replaces non-alphanumeric characters with underscores.
19
+ """
20
  if not isinstance(col_name, str):
21
  return str(col_name)
22
  cleaned = re.sub(r"[^\w\s]", " ", col_name)
 
24
 
25
  def clean_tin_value(val):
26
  """
27
+ Clean the TIN value by stripping whitespace and, if it ends with '.0',
28
+ converting it to an integer string.
29
  """
30
  val_str = str(val).strip()
31
  if val_str.endswith('.0'):
 
37
 
38
  def standardize_dataframe(df: pd.DataFrame) -> pd.DataFrame:
39
  """
40
+ Standardize DataFrame column names and data types:
41
+ - Drops any middle name columns.
42
+ - Cleans all column names (e.g., "Employee Name" -> "employee_name").
43
  - Renames synonyms to common names (e.g., 'tin', 'salary').
44
  - Creates an 'employee_name' column if missing but first_name and last_name exist.
45
+ - Combines duplicate key columns into one.
46
  - Forces the key columns 'tin' and 'employee_name' to be strings.
 
47
  """
48
  # Drop any column that appears to be a middle name
49
  middle_name_cols = [col for col in df.columns if 'middle_name' in col.lower()]
50
  if middle_name_cols:
51
  df = df.drop(columns=middle_name_cols)
52
 
53
+ # Clean all column names first so that "Employee Name" becomes "employee_name"
54
+ df.columns = [clean_column_name(col) for col in df.columns]
55
+
56
+ # Rename columns based on synonyms for TIN and salary
57
  rename_map = {}
58
  for col in df.columns:
59
+ if col in ['personal id', 'personal_id', 'tax id', 'taxid'] or "personal_id_of_employee" in col:
 
60
  rename_map[col] = 'tin'
61
+ elif 'tin' in col:
62
  rename_map[col] = 'tin'
63
+ if any(keyword in col for keyword in ['salary', 'wage', 'earning', 'commission', 'fee', 'payment', 'compensation']):
64
  rename_map[col] = 'salary'
65
  if rename_map:
66
  df = df.rename(columns=rename_map)
67
+
68
+ # Combine duplicate columns (e.g., multiple salary or tin columns)
69
  if 'salary' in df.columns and list(df.columns).count('salary') > 1:
70
  salary_cols = [col for col in df.columns if col == 'salary']
71
  df['salary'] = df[salary_cols].bfill(axis=1).iloc[:, 0]
 
74
  tin_cols = [col for col in df.columns if col == 'tin']
75
  df['tin'] = df[tin_cols].bfill(axis=1).iloc[:, 0]
76
  df = df.loc[:, ~df.columns.duplicated()]
77
+
78
+ # If employee_name is missing and first_name and last_name exist, create it.
79
  if 'employee_name' not in df.columns and 'first_name' in df.columns and 'last_name' in df.columns:
80
  df['employee_name'] = df['first_name'].astype(str).str.strip() + ' ' + df['last_name'].astype(str).str.strip()
81
+
82
+ # Ensure key columns are of the correct type
83
  if 'salary' in df.columns:
84
  df['salary'] = pd.to_numeric(df['salary'], errors='coerce')
85
  if 'tin' in df.columns:
86
  df['tin'] = df['tin'].fillna('').astype(str).apply(clean_tin_value)
87
  if 'employee_name' in df.columns:
88
  df['employee_name'] = df['employee_name'].fillna('').astype(str).str.strip()
89
+
90
  return df
91
 
92
  def analyze_columns(df: pd.DataFrame, filename: str) -> dict:
93
+ """
94
+ Analyze DataFrame columns using Gemini AI.
95
+ Returns a JSON object with details about columns, key columns for merging,
96
+ any data quality issues, and suggested renames.
97
+ """
98
  try:
99
  display_df = df.head(5).copy()
100
  for col in display_df.columns:
 
163
  }
164
 
165
  def read_excel_file(file) -> pd.DataFrame:
166
+ """
167
+ Read an Excel file with error handling.
168
+ Tries openpyxl first and falls back to xlrd.
169
+ """
170
  try:
171
  return pd.read_excel(file, engine="openpyxl")
172
  except Exception as e1:
 
180
  """
181
  Merge multiple DataFrames using a two-step process:
182
  1. Use the earnings file as the master. Drop its inaccurate 'tin' column.
183
+ 2. Merge template info onto earnings via 'employee_name' (the key provided by "Employee Name").
184
  3. Then merge the resulting DataFrame with the PAYE file using the (correct) 'tin' key.
185
  """
186
  earnings_file = None
187
  paye_file = None
188
  template_file = None
189
+
190
+ # Identify files based on filename keywords
191
  for file_info in processed_files:
192
  lower_filename = file_info["filename"].lower()
193
  if "earnings" in lower_filename:
 
200
  st.warning("No earnings file found as master. Using the first file as master.")
201
  earnings_file = processed_files[0]
202
 
203
+ # Use the earnings DataFrame as the master
204
  earnings_df = earnings_file["df"]
205
  # Drop the inaccurate 'tin' column from earnings, if present
206
  if 'tin' in earnings_df.columns:
207
  earnings_df = earnings_df.drop(columns=['tin'])
208
+ # Double-check removal of any middle_name column (should already be done in standardization)
209
  if 'middle_name' in earnings_df.columns:
210
  earnings_df = earnings_df.drop(columns=['middle_name'])
211
 
212
  merged_df = earnings_df.copy()
213
 
214
+ # Merge template info onto earnings using 'employee_name'
215
  if template_file is not None:
216
+ st.write(f"Merging template info from '{template_file['filename']}' using key 'employee_name'.")
217
  template_df = template_file["df"]
218
  # Drop any middle_name column from the template file
219
  if 'middle_name' in template_df.columns:
 
230
 
231
  # Merge PAYE figures onto the merged DataFrame using 'tin'
232
  if paye_file is not None:
233
+ st.write(f"Merging PAYE figures from '{paye_file['filename']}' using key 'tin'.")
234
  paye_df = paye_file["df"]
235
  if 'tin' in merged_df.columns and 'tin' in paye_df.columns:
236
  merged_df = merged_df.merge(paye_df, on='tin', how='left', suffixes=('', '_paye'))
 
242
  return merged_df
243
 
244
def safe_display_df(df: pd.DataFrame) -> pd.DataFrame:
    """
    Prepare a DataFrame for safe display in Streamlit.

    Converts every cell to a string (so Arrow serialization cannot fail on
    mixed dtypes) and blanks out the textual null placeholders that pandas
    produces when stringifying missing values: "nan" (float NaN), "None"
    (object-dtype None), "NaT" (missing datetimes), and "<NA>" (pandas'
    NA scalar). The original implementation handled only "nan"/"None",
    so datetime and nullable-dtype columns displayed literal "NaT"/"<NA>".

    Args:
        df: DataFrame to sanitize. Not modified in place.

    Returns:
        A new all-string DataFrame with null placeholders replaced by "".
    """
    return df.astype(str).replace({"nan": "", "None": "", "NaT": "", "<NA>": ""})
250
 
251
  def main():
 
266
  df = pd.read_csv(uploaded_file)
267
  if df is not None:
268
  if df.empty:
269
+ st.warning(f"DataFrame from '{uploaded_file.name}' is empty. Please check the file.")
270
  continue
271
+ # Standardize column names and key columns
272
  df = standardize_dataframe(df)
273
  st.write("Initial Preview:")
274
  st.dataframe(df.head())
 
277
  if analysis:
278
  st.write("Column Analysis:")
279
  st.json(analysis)
280
+ # Apply suggested renames if provided by the analysis
281
  if 'suggested_renames' in analysis:
282
  df = df.rename(columns=analysis['suggested_renames'])
283
  processed_files.append(
 
308
  st.write(f"Total rows: {len(merged_df)}")
309
  st.write(f"Total columns: {len(merged_df.columns)}")
310
  st.write("### Data Quality Metrics")
311
+ missing_df = pd.DataFrame({
312
+ "Column": merged_df.columns,
313
+ "Missing Values": merged_df.isnull().sum().values,
314
+ "Missing Percentage": (merged_df.isnull().sum().values / len(merged_df) * 100).round(2),
315
+ })
 
 
316
  st.dataframe(missing_df)
317
  duplicates = merged_df.duplicated().sum()
318
  st.write(f"Number of duplicate rows: {duplicates}")