rairo committed on
Commit
c71c6c1
·
verified ·
1 Parent(s): 1f19aba

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +50 -53
app.py CHANGED
@@ -69,22 +69,22 @@ def standardize_dataframe(df: pd.DataFrame) -> pd.DataFrame:
69
  return df
70
 
71
  def analyze_columns(df: pd.DataFrame, filename: str) -> dict:
72
- """Analyze DataFrame columns using Gemini AI with improved error handling and prompt."""
73
  try:
74
  display_df = df.head(5).copy()
75
  for col in display_df.columns:
76
  display_df[col] = display_df[col].astype(str)
77
  sample_csv = display_df.to_csv(index=False)
78
  prompt = f"""
79
- Analyze this CSV data, which represents employee payroll information, and provide analysis in JSON format.
80
 
81
  Filename: {filename}
82
  Sample data (first 5 rows):
83
  {sample_csv}
84
 
85
- In the context of merging datasets, "key columns" are columns that uniquely identify records and are essential for joining this data with other datasets. For payroll data, key columns are typically employee identifiers such as Employee ID, Taxpayer Identification Number (TIN), or Employee Name (if unique).
86
 
87
- Please analyze the columns in the sample data and identify potential key columns that can be used to merge this dataset with other employee-related datasets.
88
 
89
  Respond with ONLY a valid JSON object in the following format:
90
 
@@ -97,14 +97,14 @@ def analyze_columns(df: pd.DataFrame, filename: str) -> dict:
97
  "description": "Brief description of the column and its likely content."
98
  }}
99
  ],
100
- "key_columns": ["List of identified key column names. Prioritize employee identifiers like employee_id, tin, or employee_name if they appear to be unique identifiers."],
101
  "issues": ["List any data quality issues found, like missing values in important columns."],
102
  "suggested_renames": {{
103
  "old_name": "new_name"
104
  }}
105
  }}
106
 
107
- Ensure the JSON response is valid and parsable. Focus on accurately identifying key columns relevant for merging payroll data.
108
  """
109
  response = model.generate_content(prompt)
110
  response_text = response.text.strip()
@@ -150,49 +150,48 @@ def read_excel_file(file) -> pd.DataFrame:
150
 
151
  def merge_with_master(processed_files):
152
  """
153
- Merge multiple DataFrames using the earnings schedule file as the master.
154
- The master file is identified by having 'earnings' in its filename (case insensitive).
155
- Other files are merged onto the master using key columns identified by AI analysis.
156
  """
157
  master_file = None
158
- other_files = []
 
159
  for file_info in processed_files:
160
- if "earnings" in file_info["filename"].lower():
 
161
  master_file = file_info
162
- else:
163
- other_files.append(file_info)
 
 
164
  if not master_file:
165
- st.warning("No master file with 'earnings' found. Using the first file as master.")
166
  master_file = processed_files[0]
167
- other_files = processed_files[1:]
168
-
169
- if not master_file: # Handle case where no files are processed correctly
170
- st.error("No master file could be determined. Merging cannot proceed.")
171
- return None
172
-
173
  master_df = master_file["df"]
174
- if master_df.empty: # Check if master_df is empty. If so, no point merging.
175
- st.warning(f"Master DataFrame '{master_file['filename']}' is empty. Merging will result in an empty DataFrame.")
176
- return pd.DataFrame() # Return empty DataFrame
177
-
178
- master_keys = master_file["analysis"].get("key_columns", [])
179
- st.write(f"Using '{master_file['filename']}' as master with key columns: {master_keys}")
180
- merged_df = master_df
181
-
182
-
183
- for other in other_files:
184
- other_df = other["df"]
185
- if other_df.empty: # Check if other_df is empty before merging
186
- st.warning(f"DataFrame '{other['filename']}' is empty. Skipping merge for this file.")
187
- continue
188
-
189
- other_keys = other["analysis"].get("key_columns", [])
190
- common_keys = list(set(master_keys).intersection(set(other_keys)))
191
- if common_keys:
192
- st.write(f"Merging '{other['filename']}' on keys: {common_keys}")
193
- merged_df = merged_df.merge(other_df, on=common_keys, how="left")
194
  else:
195
- st.warning(f"No common keys found for merging '{other['filename']}'. Skipping this file.")
 
 
 
196
  return merged_df
197
 
198
  def safe_display_df(df: pd.DataFrame) -> pd.DataFrame:
@@ -216,9 +215,9 @@ def main():
216
  else:
217
  df = pd.read_csv(uploaded_file)
218
  if df is not None:
219
- if df.empty: # Check if dataframe is empty immediately after reading.
220
  st.warning(f"DataFrame from '{uploaded_file.name}' is empty after reading. Please check the file.")
221
- continue # Skip to next file
222
  df.columns = [clean_column_name(col) for col in df.columns]
223
  df = standardize_dataframe(df)
224
  st.write("Initial Preview:")
@@ -228,24 +227,23 @@ def main():
228
  if analysis:
229
  st.write("Column Analysis:")
230
  st.json(analysis)
231
- # Apply suggested renames
232
  if 'suggested_renames' in analysis:
233
  df = df.rename(columns=analysis['suggested_renames'])
234
  processed_files.append(
235
  {"filename": uploaded_file.name, "df": df, "analysis": analysis}
236
  )
237
  else:
238
- st.error(f"Could not read data from '{uploaded_file.name}'.") # Explicit error if read_excel_file returns None
239
-
240
  except Exception as e:
241
  st.error(f"Error processing {uploaded_file.name}: {str(e)}")
242
  continue
243
 
244
  if len(processed_files) > 1:
245
- if not any(file_info["df"].empty for file_info in processed_files): # Check if any processed dataframe is empty before merging
246
- st.write("### Merging DataFrames with Earnings Schedule as Master")
247
  merged_df = merge_with_master(processed_files)
248
- if merged_df is not None and not merged_df.empty: # Check merged_df is not None and not empty before displaying.
249
  st.write("### Preview of Merged Data")
250
  st.dataframe(safe_display_df(merged_df.head()))
251
  try:
@@ -265,18 +263,17 @@ def main():
265
  "Column": merged_df.columns,
266
  "Missing Values": merged_df.isnull().sum().values,
267
  "Missing Percentage": (merged_df.isnull().sum().values / len(merged_df) * 100).round(2),
268
- }
269
- )
270
  st.dataframe(missing_df)
271
  duplicates = merged_df.duplicated().sum()
272
  st.write(f"Number of duplicate rows: {duplicates}")
273
  except Exception as e:
274
  st.error(f"Error preparing download: {str(e)}")
275
- elif merged_df is not None and merged_df.empty: # Explicitly handle empty merged dataframe case
276
  st.warning("The merged DataFrame is empty. Please check the input files and merging keys.")
277
  else:
278
  st.warning("One or more of the processed DataFrames is empty. Merging cannot proceed meaningfully.")
279
-
280
  else:
281
  st.warning("Please upload at least 2 files to merge.")
282
 
 
69
  return df
70
 
71
  def analyze_columns(df: pd.DataFrame, filename: str) -> dict:
72
+ """Analyze DataFrame columns using Gemini AI with an updated prompt."""
73
  try:
74
  display_df = df.head(5).copy()
75
  for col in display_df.columns:
76
  display_df[col] = display_df[col].astype(str)
77
  sample_csv = display_df.to_csv(index=False)
78
  prompt = f"""
79
+ Analyze this CSV data, which may represent an employee earnings schedule, PAYE figures, or template info containing additional details for payroll processing. Provide an analysis in JSON format.
80
 
81
  Filename: {filename}
82
  Sample data (first 5 rows):
83
  {sample_csv}
84
 
85
+ For merging these datasets, key columns are essential. The earnings schedule is considered the master file, and PAYE figures and template info should be merged into it using common identifiers such as Tax Identification Number (TIN), Employee ID, or Employee Name if unique.
86
 
87
+ Please analyze the columns in the sample data and identify potential key columns for merging. Also, report any data quality issues and suggest renames to standardize the column names.
88
 
89
  Respond with ONLY a valid JSON object in the following format:
90
 
 
97
  "description": "Brief description of the column and its likely content."
98
  }}
99
  ],
100
+ "key_columns": ["List of identified key column names. Prioritize employee identifiers like employee_id, tin, or employee_name."],
101
  "issues": ["List any data quality issues found, like missing values in important columns."],
102
  "suggested_renames": {{
103
  "old_name": "new_name"
104
  }}
105
  }}
106
 
107
+ Ensure the JSON response is valid and parsable.
108
  """
109
  response = model.generate_content(prompt)
110
  response_text = response.text.strip()
 
150
 
151
  def merge_with_master(processed_files):
152
  """
153
+ Merge multiple DataFrames by using the earnings schedule file as the master.
154
+ This modified logic looks for files whose names include 'earnings', 'paye', or 'template'.
155
+ The PAYE figures and the template info are merged onto the earnings sheet using the 'tin' key.
156
  """
157
  master_file = None
158
+ paye_file = None
159
+ template_file = None
160
  for file_info in processed_files:
161
+ lower_filename = file_info["filename"].lower()
162
+ if "earnings" in lower_filename:
163
  master_file = file_info
164
+ elif "paye" in lower_filename:
165
+ paye_file = file_info
166
+ elif "template" in lower_filename:
167
+ template_file = file_info
168
  if not master_file:
169
+ st.warning("No earnings file found as master. Using the first file as master.")
170
  master_file = processed_files[0]
171
+
 
 
 
 
 
172
  master_df = master_file["df"]
173
+ merged_df = master_df.copy()
174
+
175
+ # Merge PAYE figures onto the earnings sheet
176
+ if paye_file is not None:
177
+ st.write(f"Merging PAYE figures from '{paye_file['filename']}' onto the earnings sheet using key 'tin'.")
178
+ if 'tin' in merged_df.columns and 'tin' in paye_file["df"].columns:
179
+ merged_df = merged_df.merge(paye_file["df"], on='tin', how='left', suffixes=('', '_paye'))
180
+ else:
181
+ st.warning("Column 'tin' missing in either the earnings or PAYE file. Skipping PAYE merge.")
182
+ else:
183
+ st.warning("No PAYE file detected.")
184
+
185
+ # Merge template info onto the earnings sheet
186
+ if template_file is not None:
187
+ st.write(f"Merging template info from '{template_file['filename']}' onto the earnings sheet using key 'tin'.")
188
+ if 'tin' in merged_df.columns and 'tin' in template_file["df"].columns:
189
+ merged_df = merged_df.merge(template_file["df"], on='tin', how='left', suffixes=('', '_template'))
 
 
 
190
  else:
191
+ st.warning("Column 'tin' missing in either the earnings or template file. Skipping template merge.")
192
+ else:
193
+ st.warning("No template file detected.")
194
+
195
  return merged_df
196
 
197
  def safe_display_df(df: pd.DataFrame) -> pd.DataFrame:
 
215
  else:
216
  df = pd.read_csv(uploaded_file)
217
  if df is not None:
218
+ if df.empty:
219
  st.warning(f"DataFrame from '{uploaded_file.name}' is empty after reading. Please check the file.")
220
+ continue
221
  df.columns = [clean_column_name(col) for col in df.columns]
222
  df = standardize_dataframe(df)
223
  st.write("Initial Preview:")
 
227
  if analysis:
228
  st.write("Column Analysis:")
229
  st.json(analysis)
230
+ # Apply suggested renames if provided
231
  if 'suggested_renames' in analysis:
232
  df = df.rename(columns=analysis['suggested_renames'])
233
  processed_files.append(
234
  {"filename": uploaded_file.name, "df": df, "analysis": analysis}
235
  )
236
  else:
237
+ st.error(f"Could not read data from '{uploaded_file.name}'.")
 
238
  except Exception as e:
239
  st.error(f"Error processing {uploaded_file.name}: {str(e)}")
240
  continue
241
 
242
  if len(processed_files) > 1:
243
+ if not any(file_info["df"].empty for file_info in processed_files):
244
+ st.write("### Merging DataFrames (Earnings as Master)")
245
  merged_df = merge_with_master(processed_files)
246
+ if merged_df is not None and not merged_df.empty:
247
  st.write("### Preview of Merged Data")
248
  st.dataframe(safe_display_df(merged_df.head()))
249
  try:
 
263
  "Column": merged_df.columns,
264
  "Missing Values": merged_df.isnull().sum().values,
265
  "Missing Percentage": (merged_df.isnull().sum().values / len(merged_df) * 100).round(2),
266
+ }
267
+ )
268
  st.dataframe(missing_df)
269
  duplicates = merged_df.duplicated().sum()
270
  st.write(f"Number of duplicate rows: {duplicates}")
271
  except Exception as e:
272
  st.error(f"Error preparing download: {str(e)}")
273
+ elif merged_df is not None and merged_df.empty:
274
  st.warning("The merged DataFrame is empty. Please check the input files and merging keys.")
275
  else:
276
  st.warning("One or more of the processed DataFrames is empty. Merging cannot proceed meaningfully.")
 
277
  else:
278
  st.warning("Please upload at least 2 files to merge.")
279