rairo committed on
Commit
320193a
·
verified ·
1 Parent(s): 3bfe933

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +73 -37
app.py CHANGED
@@ -69,33 +69,42 @@ def standardize_dataframe(df: pd.DataFrame) -> pd.DataFrame:
69
  return df
70
 
71
  def analyze_columns(df: pd.DataFrame, filename: str) -> dict:
72
- """Analyze DataFrame columns using Gemini AI with improved error handling."""
73
  try:
74
  display_df = df.head(5).copy()
75
  for col in display_df.columns:
76
  display_df[col] = display_df[col].astype(str)
77
  sample_csv = display_df.to_csv(index=False)
78
  prompt = f"""
79
- Analyze this CSV data and provide analysis in JSON format.
 
80
  Filename: {filename}
81
- Sample data:
82
  {sample_csv}
83
- Respond with only a valid JSON object in this format:
 
 
 
 
 
 
84
  {{
85
- "subject": "Employee payroll data",
86
  "columns": [
87
  {{
88
  "name": "column_name",
89
  "type": "string/number/date",
90
- "description": "Brief description"
91
  }}
92
  ],
93
- "key_columns": ["employee_id", "tin"],
94
- "issues": ["Missing values in salary column"],
95
  "suggested_renames": {{
96
  "old_name": "new_name"
97
  }}
98
  }}
 
 
99
  """
100
  response = model.generate_content(prompt)
101
  response_text = response.text.strip()
@@ -156,12 +165,26 @@ def merge_with_master(processed_files):
156
  st.warning("No master file with 'earnings' found. Using the first file as master.")
157
  master_file = processed_files[0]
158
  other_files = processed_files[1:]
 
 
 
 
 
159
  master_df = master_file["df"]
160
  master_keys = master_file["analysis"].get("key_columns", [])
161
  st.write(f"Using '{master_file['filename']}' as master with key columns: {master_keys}")
162
  merged_df = master_df
 
 
 
 
 
163
  for other in other_files:
164
  other_df = other["df"]
 
 
 
 
165
  other_keys = other["analysis"].get("key_columns", [])
166
  common_keys = list(set(master_keys).intersection(set(other_keys)))
167
  if common_keys:
@@ -192,6 +215,9 @@ def main():
192
  else:
193
  df = pd.read_csv(uploaded_file)
194
  if df is not None:
 
 
 
195
  df.columns = [clean_column_name(col) for col in df.columns]
196
  df = standardize_dataframe(df)
197
  st.write("Initial Preview:")
@@ -207,39 +233,49 @@ def main():
207
  processed_files.append(
208
  {"filename": uploaded_file.name, "df": df, "analysis": analysis}
209
  )
 
 
 
210
  except Exception as e:
211
  st.error(f"Error processing {uploaded_file.name}: {str(e)}")
212
  continue
 
213
  if len(processed_files) > 1:
214
- st.write("### Merging DataFrames with Earnings Schedule as Master")
215
- merged_df = merge_with_master(processed_files)
216
- if merged_df is not None:
217
- st.write("### Preview of Merged Data")
218
- st.dataframe(safe_display_df(merged_df.head()))
219
- try:
220
- csv = merged_df.to_csv(index=False)
221
- st.download_button(
222
- label="Download Merged CSV",
223
- data=csv,
224
- file_name="merged_data.csv",
225
- mime="text/csv",
226
- )
227
- st.write("### Dataset Statistics")
228
- st.write(f"Total rows: {len(merged_df)}")
229
- st.write(f"Total columns: {len(merged_df.columns)}")
230
- st.write("### Data Quality Metrics")
231
- missing_df = pd.DataFrame(
232
- {
233
- "Column": merged_df.columns,
234
- "Missing Values": merged_df.isnull().sum().values,
235
- "Missing Percentage": (merged_df.isnull().sum().values / len(merged_df) * 100).round(2),
236
- }
237
- )
238
- st.dataframe(missing_df)
239
- duplicates = merged_df.duplicated().sum()
240
- st.write(f"Number of duplicate rows: {duplicates}")
241
- except Exception as e:
242
- st.error(f"Error preparing download: {str(e)}")
 
 
 
 
 
 
243
  else:
244
  st.warning("Please upload at least 2 files to merge.")
245
 
 
69
  return df
70
 
71
  def analyze_columns(df: pd.DataFrame, filename: str) -> dict:
72
+ """Analyze DataFrame columns using Gemini AI with improved error handling and prompt."""
73
  try:
74
  display_df = df.head(5).copy()
75
  for col in display_df.columns:
76
  display_df[col] = display_df[col].astype(str)
77
  sample_csv = display_df.to_csv(index=False)
78
  prompt = f"""
79
+ Analyze this CSV data, which represents employee payroll information, and provide analysis in JSON format.
80
+
81
  Filename: {filename}
82
+ Sample data (first 5 rows):
83
  {sample_csv}
84
+
85
+ In the context of merging datasets, "key columns" are columns that uniquely identify records and are essential for joining this data with other datasets. For payroll data, key columns are typically employee identifiers such as Employee ID, Taxpayer Identification Number (TIN), or Employee Name (if unique).
86
+
87
+ Please analyze the columns in the sample data and identify potential key columns that can be used to merge this dataset with other employee-related datasets.
88
+
89
+ Respond with ONLY a valid JSON object in the following format:
90
+
91
  {{
92
+ "subject": "Employee payroll data analysis",
93
  "columns": [
94
  {{
95
  "name": "column_name",
96
  "type": "string/number/date",
97
+ "description": "Brief description of the column and its likely content."
98
  }}
99
  ],
100
+ "key_columns": ["List of identified key column names. Prioritize employee identifiers like employee_id, tin, or employee_name if they appear to be unique identifiers."],
101
+ "issues": ["List any data quality issues found, like missing values in important columns."],
102
  "suggested_renames": {{
103
  "old_name": "new_name"
104
  }}
105
  }}
106
+
107
+ Ensure the JSON response is valid and parsable. Focus on accurately identifying key columns relevant for merging payroll data.
108
  """
109
  response = model.generate_content(prompt)
110
  response_text = response.text.strip()
 
165
  st.warning("No master file with 'earnings' found. Using the first file as master.")
166
  master_file = processed_files[0]
167
  other_files = processed_files[1:]
168
+
169
+ if not master_file: # Handle case where no files are processed correctly
170
+ st.error("No master file could be determined. Merging cannot proceed.")
171
+ return None
172
+
173
  master_df = master_file["df"]
174
  master_keys = master_file["analysis"].get("key_columns", [])
175
  st.write(f"Using '{master_file['filename']}' as master with key columns: {master_keys}")
176
  merged_df = master_df
177
+
178
+ if merged_df.empty: # Check if master_df is empty. If so, no point merging.
179
+ st.warning(f"Master DataFrame '{master_file['filename']}' is empty. Merging will result in an empty DataFrame.")
180
+ return merged_df
181
+
182
  for other in other_files:
183
  other_df = other["df"]
184
+ if other_df.empty: # Check if other_df is empty before merging
185
+ st.warning(f"DataFrame '{other['filename']}' is empty. Skipping merge for this file.")
186
+ continue
187
+
188
  other_keys = other["analysis"].get("key_columns", [])
189
  common_keys = list(set(master_keys).intersection(set(other_keys)))
190
  if common_keys:
 
215
  else:
216
  df = pd.read_csv(uploaded_file)
217
  if df is not None:
218
+ if df.empty: # Check if dataframe is empty immediately after reading.
219
+ st.warning(f"DataFrame from '{uploaded_file.name}' is empty after reading. Please check the file.")
220
+ continue # Skip to next file
221
  df.columns = [clean_column_name(col) for col in df.columns]
222
  df = standardize_dataframe(df)
223
  st.write("Initial Preview:")
 
233
  processed_files.append(
234
  {"filename": uploaded_file.name, "df": df, "analysis": analysis}
235
  )
236
+ else:
237
+ st.error(f"Could not read data from '{uploaded_file.name}'.") # Explicit error if read_excel_file returns None
238
+
239
  except Exception as e:
240
  st.error(f"Error processing {uploaded_file.name}: {str(e)}")
241
  continue
242
+
243
  if len(processed_files) > 1:
244
+ if not any(file_info["df"].empty for file_info in processed_files): # Check if any processed dataframe is empty before merging
245
+ st.write("### Merging DataFrames with Earnings Schedule as Master")
246
+ merged_df = merge_with_master(processed_files)
247
+ if merged_df is not None and not merged_df.empty: # Check merged_df is not None and not empty before displaying.
248
+ st.write("### Preview of Merged Data")
249
+ st.dataframe(safe_display_df(merged_df.head()))
250
+ try:
251
+ csv = merged_df.to_csv(index=False)
252
+ st.download_button(
253
+ label="Download Merged CSV",
254
+ data=csv,
255
+ file_name="merged_data.csv",
256
+ mime="text/csv",
257
+ )
258
+ st.write("### Dataset Statistics")
259
+ st.write(f"Total rows: {len(merged_df)}")
260
+ st.write(f"Total columns: {len(merged_df.columns)}")
261
+ st.write("### Data Quality Metrics")
262
+ missing_df = pd.DataFrame(
263
+ {
264
+ "Column": merged_df.columns,
265
+ "Missing Values": merged_df.isnull().sum().values,
266
+ "Missing Percentage": (merged_df.isnull().sum().values / len(merged_df) * 100).round(2),
267
+ }
268
+ )
269
+ st.dataframe(missing_df)
270
+ duplicates = merged_df.duplicated().sum()
271
+ st.write(f"Number of duplicate rows: {duplicates}")
272
+ except Exception as e:
273
+ st.error(f"Error preparing download: {str(e)}")
274
+ elif merged_df is not None and merged_df.empty: # Explicitly handle empty merged dataframe case
275
+ st.warning("The merged DataFrame is empty. Please check the input files and merging keys.")
276
+ else:
277
+ st.warning("One or more of the processed DataFrames is empty. Merging cannot proceed meaningfully.")
278
+
279
  else:
280
  st.warning("Please upload at least 2 files to merge.")
281