rairo committed on
Commit
8018c05
·
verified ·
1 Parent(s): d2c0f12

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +37 -98
app.py CHANGED
@@ -14,8 +14,7 @@ model = genai.GenerativeModel('gemini-2.0-flash-thinking-exp')
14
 
15
  def clean_column_name(col_name):
16
  """
17
- Clean column names to be compatible with Arrow.
18
- Converts to lowercase and replaces non-alphanumeric characters with underscores.
19
  """
20
  if not isinstance(col_name, str):
21
  return str(col_name)
@@ -38,23 +37,21 @@ def clean_tin_value(val):
38
  def standardize_dataframe(df: pd.DataFrame) -> pd.DataFrame:
39
  """
40
  Standardize DataFrame column names and data types:
41
- - Drops any middle name columns.
42
- - Cleans all column names (e.g., "Employee Name" becomes "employee_name").
43
- - Renames synonyms to common names (e.g., mapping TIN-related columns to 'tin'
44
- and salary-related columns to 'salary').
45
- - Creates an 'employee_name' column if missing but first_name and last_name exist.
46
- - Combines duplicate key columns into one.
47
- - Forces key columns (tin and employee_name) to be strings.
48
  """
49
- # Drop any column that appears to be a middle name
50
  middle_name_cols = [col for col in df.columns if 'middle_name' in col.lower()]
51
  if middle_name_cols:
52
  df = df.drop(columns=middle_name_cols)
53
 
54
- # Clean all column names so that "Employee Name" becomes "employee_name", etc.
55
  df.columns = [clean_column_name(col) for col in df.columns]
56
 
57
- # Build a rename map for TIN and salary synonyms.
58
  rename_map = {}
59
  for col in df.columns:
60
  if col in ['personal id', 'personal_id', 'tax id', 'taxid'] or "personal_id_of_employee" in col:
@@ -66,7 +63,7 @@ def standardize_dataframe(df: pd.DataFrame) -> pd.DataFrame:
66
  if rename_map:
67
  df = df.rename(columns=rename_map)
68
 
69
- # Combine duplicate columns for salary and tin if needed
70
  if 'salary' in df.columns and list(df.columns).count('salary') > 1:
71
  salary_cols = [col for col in df.columns if col == 'salary']
72
  df['salary'] = df[salary_cols].bfill(axis=1).iloc[:, 0]
@@ -76,11 +73,11 @@ def standardize_dataframe(df: pd.DataFrame) -> pd.DataFrame:
76
  df['tin'] = df[tin_cols].bfill(axis=1).iloc[:, 0]
77
  df = df.loc[:, ~df.columns.duplicated()]
78
 
79
- # If employee_name is missing but first_name and last_name exist, create it.
80
  if 'employee_name' not in df.columns and 'first_name' in df.columns and 'last_name' in df.columns:
81
  df['employee_name'] = df['first_name'].astype(str).str.strip() + ' ' + df['last_name'].astype(str).str.strip()
82
 
83
- # Ensure key columns are of the proper type.
84
  if 'salary' in df.columns:
85
  df['salary'] = pd.to_numeric(df['salary'], errors='coerce')
86
  if 'tin' in df.columns:
@@ -92,9 +89,7 @@ def standardize_dataframe(df: pd.DataFrame) -> pd.DataFrame:
92
 
93
  def analyze_columns(df: pd.DataFrame, filename: str) -> dict:
94
  """
95
- Analyze DataFrame columns using Gemini AI.
96
- Returns a JSON object with details about columns, key columns for merging,
97
- any data quality issues, and suggested renames.
98
  """
99
  try:
100
  display_df = df.head(5).copy()
@@ -102,35 +97,12 @@ def analyze_columns(df: pd.DataFrame, filename: str) -> dict:
102
  display_df[col] = display_df[col].astype(str)
103
  sample_csv = display_df.to_csv(index=False)
104
  prompt = f"""
105
- Analyze this CSV data, which may represent an employee earnings schedule, PAYE figures, or template info containing additional details for payroll processing. Provide an analysis in JSON format.
106
-
107
  Filename: {filename}
108
  Sample data (first 5 rows):
109
  {sample_csv}
110
-
111
- For merging these datasets, key columns are essential. The earnings schedule is considered the master file, and PAYE figures and template info should be merged into it using common identifiers such as Tax Identification Number (TIN), Employee ID, or Employee Name if unique.
112
-
113
- Please analyze the columns in the sample data and identify potential key columns for merging. Also, report any data quality issues and suggest renames to standardize the column names.
114
-
115
- Respond with ONLY a valid JSON object in the following format:
116
-
117
- {{
118
- "subject": "Employee payroll data analysis",
119
- "columns": [
120
- {{
121
- "name": "column_name",
122
- "type": "string/number/date",
123
- "description": "Brief description of the column and its likely content."
124
- }}
125
- ],
126
- "key_columns": ["List of identified key column names. Prioritize employee identifiers like employee_id, tin, or employee_name."],
127
- "issues": ["List any data quality issues found, like missing values in important columns."],
128
- "suggested_renames": {{
129
- "old_name": "new_name"
130
- }}
131
- }}
132
-
133
- Ensure the JSON response is valid and parsable.
134
  """
135
  response = model.generate_content(prompt)
136
  response_text = response.text.strip()
@@ -146,27 +118,14 @@ def analyze_columns(df: pd.DataFrame, filename: str) -> dict:
146
  st.error(f"JSON parsing error: {str(je)}")
147
  st.text("Raw response:")
148
  st.text(response_text)
149
- return {
150
- "subject": "Error parsing analysis",
151
- "columns": [],
152
- "key_columns": [],
153
- "issues": ["Error analyzing columns"],
154
- "suggested_renames": {},
155
- }
156
  except Exception as e:
157
  st.error(f"Error in column analysis: {str(e)}")
158
- return {
159
- "subject": "Error in analysis",
160
- "columns": [],
161
- "key_columns": [],
162
- "issues": [str(e)],
163
- "suggested_renames": {},
164
- }
165
 
166
  def read_excel_file(file) -> pd.DataFrame:
167
  """
168
  Read an Excel file with error handling.
169
- Tries openpyxl first and falls back to xlrd.
170
  """
171
  try:
172
  return pd.read_excel(file, engine="openpyxl")
@@ -179,19 +138,17 @@ def read_excel_file(file) -> pd.DataFrame:
179
 
180
  def merge_with_master(processed_files):
181
  """
182
- Merge multiple DataFrames using a two-step process:
183
- 1. Use the earnings file as master and drop its inaccurate 'tin' column.
184
- 2. Merge template info onto earnings using 'employee_name' (the key provided by "Employee Name").
185
- The trusted 'tin' comes from the template file.
186
- For the template file, force its first column (which is "Personal ID of Employee") to be 'tin'.
187
- 3. Check that the merged earnings-template data has a 'tin' column populated.
188
- If present, merge the resulting DataFrame with the PAYE file using 'tin'.
189
  """
190
  earnings_file = None
191
  paye_file = None
192
  template_file = None
193
 
194
- # Identify files based on filename keywords.
195
  for file_info in processed_files:
196
  lower_filename = file_info["filename"].lower()
197
  if "earnings" in lower_filename:
@@ -205,49 +162,43 @@ def merge_with_master(processed_files):
205
  st.warning("No earnings file found as master. Using the first file as master.")
206
  earnings_file = processed_files[0]
207
 
208
- # Use the earnings DataFrame as the master.
209
  earnings_df = earnings_file["df"]
210
- # Drop the inaccurate 'tin' column from earnings if it exists.
211
  if 'tin' in earnings_df.columns:
212
  earnings_df = earnings_df.drop(columns=['tin'])
213
- # Remove any middle_name column.
214
  if 'middle_name' in earnings_df.columns:
215
  earnings_df = earnings_df.drop(columns=['middle_name'])
216
 
217
  merged_df = earnings_df.copy()
218
 
219
- # Merge template info onto earnings using 'employee_name'
220
  if template_file is not None:
221
  st.write(f"Merging template info from '{template_file['filename']}' using key 'employee_name'.")
222
  template_df = template_file["df"].copy()
223
- # Force the first column of the template file to be 'tin'
224
  if not template_df.empty:
225
  cols = list(template_df.columns)
226
  cols[0] = "tin"
227
  template_df.columns = cols
228
-
229
- # Remove any middle_name column from the template file.
230
  if 'middle_name' in template_df.columns:
231
  template_df = template_df.drop(columns=['middle_name'])
232
- # Ensure the template has an 'employee_name' column.
233
  if 'employee_name' not in template_df.columns and 'first_name' in template_df.columns and 'last_name' in template_df.columns:
234
  template_df['employee_name'] = template_df['first_name'].astype(str).str.strip() + ' ' + template_df['last_name'].astype(str).str.strip()
235
- # If after standardization the template still doesn't have employee_name,
236
- # you may need to construct it manually if possible.
237
  if 'employee_name' in merged_df.columns and 'employee_name' in template_df.columns:
238
  merged_df = merged_df.merge(template_df, on='employee_name', how='left', suffixes=('', '_template'))
239
  else:
240
- st.warning("Column 'employee_name' missing in either the earnings or template file. Skipping template merge.")
241
  else:
242
- st.warning("No template file detected. Cannot proceed without a trusted TIN from the template.")
243
 
244
- # Check that a trusted 'tin' column exists from the template merge.
245
  if 'tin' not in merged_df.columns or merged_df['tin'].isnull().all():
246
  st.error("No trusted 'tin' column found in the merged earnings-template data. Aborting further merge. "
247
- "Please ensure the template file's first column holds the trusted TIN and is properly standardized.")
248
  return merged_df
249
 
250
- # Merge PAYE figures onto the merged DataFrame using 'tin'
251
  if paye_file is not None:
252
  st.write(f"Merging PAYE figures from '{paye_file['filename']}' using key 'tin'.")
253
  paye_df = paye_file["df"]
@@ -262,17 +213,14 @@ def merge_with_master(processed_files):
262
 
263
  def safe_display_df(df: pd.DataFrame) -> pd.DataFrame:
264
  """
265
- Prepare DataFrame for safe display in Streamlit by converting all entries to strings
266
- and replacing common null placeholders.
267
  """
268
  return df.astype(str).replace({"nan": "", "None": ""})
269
 
270
  def main():
271
  st.title("Smart CSV Processor")
272
  st.write("Upload CSV or Excel files for intelligent analysis and merging.")
273
- uploaded_files = st.file_uploader(
274
- "Choose files", accept_multiple_files=True, type=["csv", "xlsx", "xls"]
275
- )
276
  if uploaded_files:
277
  st.write("### Processing Files")
278
  processed_files = []
@@ -287,7 +235,6 @@ def main():
287
  if df.empty:
288
  st.warning(f"DataFrame from '{uploaded_file.name}' is empty. Please check the file.")
289
  continue
290
- # Standardize columns and key identifiers.
291
  df = standardize_dataframe(df)
292
  st.write("Initial Preview:")
293
  st.dataframe(df.head())
@@ -296,12 +243,9 @@ def main():
296
  if analysis:
297
  st.write("Column Analysis:")
298
  st.json(analysis)
299
- # Apply any suggested renames from the analysis.
300
  if 'suggested_renames' in analysis:
301
  df = df.rename(columns=analysis['suggested_renames'])
302
- processed_files.append(
303
- {"filename": uploaded_file.name, "df": df, "analysis": analysis}
304
- )
305
  else:
306
  st.error(f"Could not read data from '{uploaded_file.name}'.")
307
  except Exception as e:
@@ -317,12 +261,7 @@ def main():
317
  st.dataframe(safe_display_df(merged_df.head()))
318
  try:
319
  csv = merged_df.to_csv(index=False)
320
- st.download_button(
321
- label="Download Merged CSV",
322
- data=csv,
323
- file_name="merged_data.csv",
324
- mime="text/csv",
325
- )
326
  st.write("### Dataset Statistics")
327
  st.write(f"Total rows: {len(merged_df)}")
328
  st.write(f"Total columns: {len(merged_df.columns)}")
 
14
 
15
  def clean_column_name(col_name):
16
  """
17
+ Clean column names: convert to lowercase, replace non-alphanumeric characters with underscores.
 
18
  """
19
  if not isinstance(col_name, str):
20
  return str(col_name)
 
37
  def standardize_dataframe(df: pd.DataFrame) -> pd.DataFrame:
38
  """
39
  Standardize DataFrame column names and data types:
40
+ - Drop any middle name columns.
41
+ - Clean column names (e.g. "Employee Name" becomes "employee_name").
42
+ - Rename synonyms (e.g., "Personal ID of Employee" to "tin").
43
+ - If missing, construct an 'employee_name' column from first and last names.
44
+ - Ensure key columns (tin and employee_name) are strings.
 
 
45
  """
46
+ # Drop columns containing 'middle_name'
47
  middle_name_cols = [col for col in df.columns if 'middle_name' in col.lower()]
48
  if middle_name_cols:
49
  df = df.drop(columns=middle_name_cols)
50
 
51
+ # Clean all column names
52
  df.columns = [clean_column_name(col) for col in df.columns]
53
 
54
+ # Rename synonyms for TIN and salary
55
  rename_map = {}
56
  for col in df.columns:
57
  if col in ['personal id', 'personal_id', 'tax id', 'taxid'] or "personal_id_of_employee" in col:
 
63
  if rename_map:
64
  df = df.rename(columns=rename_map)
65
 
66
+ # Combine duplicate columns if necessary
67
  if 'salary' in df.columns and list(df.columns).count('salary') > 1:
68
  salary_cols = [col for col in df.columns if col == 'salary']
69
  df['salary'] = df[salary_cols].bfill(axis=1).iloc[:, 0]
 
73
  df['tin'] = df[tin_cols].bfill(axis=1).iloc[:, 0]
74
  df = df.loc[:, ~df.columns.duplicated()]
75
 
76
+ # Construct employee_name if missing
77
  if 'employee_name' not in df.columns and 'first_name' in df.columns and 'last_name' in df.columns:
78
  df['employee_name'] = df['first_name'].astype(str).str.strip() + ' ' + df['last_name'].astype(str).str.strip()
79
 
80
+ # Ensure proper types for key columns
81
  if 'salary' in df.columns:
82
  df['salary'] = pd.to_numeric(df['salary'], errors='coerce')
83
  if 'tin' in df.columns:
 
89
 
90
  def analyze_columns(df: pd.DataFrame, filename: str) -> dict:
91
  """
92
+ Use Gemini AI to analyze DataFrame columns and suggest key columns and renames.
 
 
93
  """
94
  try:
95
  display_df = df.head(5).copy()
 
97
  display_df[col] = display_df[col].astype(str)
98
  sample_csv = display_df.to_csv(index=False)
99
  prompt = f"""
100
+ Analyze this CSV data, which may represent an employee earnings schedule, PAYE figures, or template info for payroll processing.
 
101
  Filename: {filename}
102
  Sample data (first 5 rows):
103
  {sample_csv}
104
+ Identify potential key columns for merging and suggest renames.
105
+ Respond with a valid JSON object.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
106
  """
107
  response = model.generate_content(prompt)
108
  response_text = response.text.strip()
 
118
  st.error(f"JSON parsing error: {str(je)}")
119
  st.text("Raw response:")
120
  st.text(response_text)
121
+ return {"subject": "Error parsing analysis", "columns": [], "key_columns": [], "issues": ["Error analyzing columns"], "suggested_renames": {}}
 
 
 
 
 
 
122
  except Exception as e:
123
  st.error(f"Error in column analysis: {str(e)}")
124
+ return {"subject": "Error in analysis", "columns": [], "key_columns": [], "issues": [str(e)], "suggested_renames": {}}
 
 
 
 
 
 
125
 
126
  def read_excel_file(file) -> pd.DataFrame:
127
  """
128
  Read an Excel file with error handling.
 
129
  """
130
  try:
131
  return pd.read_excel(file, engine="openpyxl")
 
138
 
139
  def merge_with_master(processed_files):
140
  """
141
+ Merge DataFrames in two steps:
142
+ 1. Use the earnings file as master (dropping its inaccurate 'tin').
143
+ 2. Merge the template file (which supplies the trusted TIN via its first column)
144
+ with the earnings data using 'employee_name'.
145
+ 3. Finally, merge the combined data with the PAYE file using 'tin'.
 
 
146
  """
147
  earnings_file = None
148
  paye_file = None
149
  template_file = None
150
 
151
+ # Identify files by filename keywords
152
  for file_info in processed_files:
153
  lower_filename = file_info["filename"].lower()
154
  if "earnings" in lower_filename:
 
162
  st.warning("No earnings file found as master. Using the first file as master.")
163
  earnings_file = processed_files[0]
164
 
165
+ # Process earnings file: drop its inaccurate TIN column
166
  earnings_df = earnings_file["df"]
 
167
  if 'tin' in earnings_df.columns:
168
  earnings_df = earnings_df.drop(columns=['tin'])
 
169
  if 'middle_name' in earnings_df.columns:
170
  earnings_df = earnings_df.drop(columns=['middle_name'])
171
 
172
  merged_df = earnings_df.copy()
173
 
174
+ # Process and merge the template file using employee_name
175
  if template_file is not None:
176
  st.write(f"Merging template info from '{template_file['filename']}' using key 'employee_name'.")
177
  template_df = template_file["df"].copy()
178
+ # Force the first column (Personal ID of Employee) to be 'tin'
179
  if not template_df.empty:
180
  cols = list(template_df.columns)
181
  cols[0] = "tin"
182
  template_df.columns = cols
 
 
183
  if 'middle_name' in template_df.columns:
184
  template_df = template_df.drop(columns=['middle_name'])
185
+ # If employee_name is not present, construct it from first_name and last_name
186
  if 'employee_name' not in template_df.columns and 'first_name' in template_df.columns and 'last_name' in template_df.columns:
187
  template_df['employee_name'] = template_df['first_name'].astype(str).str.strip() + ' ' + template_df['last_name'].astype(str).str.strip()
 
 
188
  if 'employee_name' in merged_df.columns and 'employee_name' in template_df.columns:
189
  merged_df = merged_df.merge(template_df, on='employee_name', how='left', suffixes=('', '_template'))
190
  else:
191
+ st.warning("Column 'employee_name' missing in either earnings or template file. Skipping template merge.")
192
  else:
193
+ st.warning("No template file detected. Cannot proceed without a trusted TIN.")
194
 
195
+ # Check for a trusted 'tin' column after merging earnings and template
196
  if 'tin' not in merged_df.columns or merged_df['tin'].isnull().all():
197
  st.error("No trusted 'tin' column found in the merged earnings-template data. Aborting further merge. "
198
+ "Ensure the template file's first column (Personal ID of Employee) is correctly populated.")
199
  return merged_df
200
 
201
+ # Merge PAYE file using the trusted 'tin'
202
  if paye_file is not None:
203
  st.write(f"Merging PAYE figures from '{paye_file['filename']}' using key 'tin'.")
204
  paye_df = paye_file["df"]
 
213
 
214
  def safe_display_df(df: pd.DataFrame) -> pd.DataFrame:
215
  """
216
+ Convert all entries in the DataFrame to strings and replace common null placeholders.
 
217
  """
218
  return df.astype(str).replace({"nan": "", "None": ""})
219
 
220
  def main():
221
  st.title("Smart CSV Processor")
222
  st.write("Upload CSV or Excel files for intelligent analysis and merging.")
223
+ uploaded_files = st.file_uploader("Choose files", accept_multiple_files=True, type=["csv", "xlsx", "xls"])
 
 
224
  if uploaded_files:
225
  st.write("### Processing Files")
226
  processed_files = []
 
235
  if df.empty:
236
  st.warning(f"DataFrame from '{uploaded_file.name}' is empty. Please check the file.")
237
  continue
 
238
  df = standardize_dataframe(df)
239
  st.write("Initial Preview:")
240
  st.dataframe(df.head())
 
243
  if analysis:
244
  st.write("Column Analysis:")
245
  st.json(analysis)
 
246
  if 'suggested_renames' in analysis:
247
  df = df.rename(columns=analysis['suggested_renames'])
248
+ processed_files.append({"filename": uploaded_file.name, "df": df, "analysis": analysis})
 
 
249
  else:
250
  st.error(f"Could not read data from '{uploaded_file.name}'.")
251
  except Exception as e:
 
261
  st.dataframe(safe_display_df(merged_df.head()))
262
  try:
263
  csv = merged_df.to_csv(index=False)
264
+ st.download_button(label="Download Merged CSV", data=csv, file_name="merged_data.csv", mime="text/csv")
 
 
 
 
 
265
  st.write("### Dataset Statistics")
266
  st.write(f"Total rows: {len(merged_df)}")
267
  st.write(f"Total columns: {len(merged_df.columns)}")