Spaces:
Build error
Build error
Update app.py
Browse files
app.py
CHANGED
|
@@ -69,22 +69,22 @@ def standardize_dataframe(df: pd.DataFrame) -> pd.DataFrame:
|
|
| 69 |
return df
|
| 70 |
|
| 71 |
def analyze_columns(df: pd.DataFrame, filename: str) -> dict:
|
| 72 |
-
"""Analyze DataFrame columns using Gemini AI with
|
| 73 |
try:
|
| 74 |
display_df = df.head(5).copy()
|
| 75 |
for col in display_df.columns:
|
| 76 |
display_df[col] = display_df[col].astype(str)
|
| 77 |
sample_csv = display_df.to_csv(index=False)
|
| 78 |
prompt = f"""
|
| 79 |
-
Analyze this CSV data, which
|
| 80 |
|
| 81 |
Filename: {filename}
|
| 82 |
Sample data (first 5 rows):
|
| 83 |
{sample_csv}
|
| 84 |
|
| 85 |
-
|
| 86 |
|
| 87 |
-
Please analyze the columns in the sample data and identify potential key columns
|
| 88 |
|
| 89 |
Respond with ONLY a valid JSON object in the following format:
|
| 90 |
|
|
@@ -97,14 +97,14 @@ def analyze_columns(df: pd.DataFrame, filename: str) -> dict:
|
|
| 97 |
"description": "Brief description of the column and its likely content."
|
| 98 |
}}
|
| 99 |
],
|
| 100 |
-
"key_columns": ["List of identified key column names.
|
| 101 |
"issues": ["List any data quality issues found, like missing values in important columns."],
|
| 102 |
"suggested_renames": {{
|
| 103 |
"old_name": "new_name"
|
| 104 |
}}
|
| 105 |
}}
|
| 106 |
|
| 107 |
-
Ensure the JSON response is valid and parsable.
|
| 108 |
"""
|
| 109 |
response = model.generate_content(prompt)
|
| 110 |
response_text = response.text.strip()
|
|
@@ -150,49 +150,48 @@ def read_excel_file(file) -> pd.DataFrame:
|
|
| 150 |
|
| 151 |
def merge_with_master(processed_files):
|
| 152 |
"""
|
| 153 |
-
Merge multiple DataFrames using the earnings schedule file as the master.
|
| 154 |
-
|
| 155 |
-
|
| 156 |
"""
|
| 157 |
master_file = None
|
| 158 |
-
|
|
|
|
| 159 |
for file_info in processed_files:
|
| 160 |
-
|
|
|
|
| 161 |
master_file = file_info
|
| 162 |
-
|
| 163 |
-
|
|
|
|
|
|
|
| 164 |
if not master_file:
|
| 165 |
-
st.warning("No
|
| 166 |
master_file = processed_files[0]
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
if not master_file: # Handle case where no files are processed correctly
|
| 170 |
-
st.error("No master file could be determined. Merging cannot proceed.")
|
| 171 |
-
return None
|
| 172 |
-
|
| 173 |
master_df = master_file["df"]
|
| 174 |
-
|
| 175 |
-
|
| 176 |
-
|
| 177 |
-
|
| 178 |
-
|
| 179 |
-
|
| 180 |
-
|
| 181 |
-
|
| 182 |
-
|
| 183 |
-
|
| 184 |
-
|
| 185 |
-
|
| 186 |
-
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
|
| 191 |
-
if common_keys:
|
| 192 |
-
st.write(f"Merging '{other['filename']}' on keys: {common_keys}")
|
| 193 |
-
merged_df = merged_df.merge(other_df, on=common_keys, how="left")
|
| 194 |
else:
|
| 195 |
-
st.warning(
|
|
|
|
|
|
|
|
|
|
| 196 |
return merged_df
|
| 197 |
|
| 198 |
def safe_display_df(df: pd.DataFrame) -> pd.DataFrame:
|
|
@@ -216,9 +215,9 @@ def main():
|
|
| 216 |
else:
|
| 217 |
df = pd.read_csv(uploaded_file)
|
| 218 |
if df is not None:
|
| 219 |
-
if df.empty:
|
| 220 |
st.warning(f"DataFrame from '{uploaded_file.name}' is empty after reading. Please check the file.")
|
| 221 |
-
continue
|
| 222 |
df.columns = [clean_column_name(col) for col in df.columns]
|
| 223 |
df = standardize_dataframe(df)
|
| 224 |
st.write("Initial Preview:")
|
|
@@ -228,24 +227,23 @@ def main():
|
|
| 228 |
if analysis:
|
| 229 |
st.write("Column Analysis:")
|
| 230 |
st.json(analysis)
|
| 231 |
-
# Apply suggested renames
|
| 232 |
if 'suggested_renames' in analysis:
|
| 233 |
df = df.rename(columns=analysis['suggested_renames'])
|
| 234 |
processed_files.append(
|
| 235 |
{"filename": uploaded_file.name, "df": df, "analysis": analysis}
|
| 236 |
)
|
| 237 |
else:
|
| 238 |
-
st.error(f"Could not read data from '{uploaded_file.name}'.")
|
| 239 |
-
|
| 240 |
except Exception as e:
|
| 241 |
st.error(f"Error processing {uploaded_file.name}: {str(e)}")
|
| 242 |
continue
|
| 243 |
|
| 244 |
if len(processed_files) > 1:
|
| 245 |
-
if not any(file_info["df"].empty for file_info in processed_files):
|
| 246 |
-
st.write("### Merging DataFrames
|
| 247 |
merged_df = merge_with_master(processed_files)
|
| 248 |
-
if merged_df is not None and not merged_df.empty:
|
| 249 |
st.write("### Preview of Merged Data")
|
| 250 |
st.dataframe(safe_display_df(merged_df.head()))
|
| 251 |
try:
|
|
@@ -265,18 +263,17 @@ def main():
|
|
| 265 |
"Column": merged_df.columns,
|
| 266 |
"Missing Values": merged_df.isnull().sum().values,
|
| 267 |
"Missing Percentage": (merged_df.isnull().sum().values / len(merged_df) * 100).round(2),
|
| 268 |
-
|
| 269 |
-
|
| 270 |
st.dataframe(missing_df)
|
| 271 |
duplicates = merged_df.duplicated().sum()
|
| 272 |
st.write(f"Number of duplicate rows: {duplicates}")
|
| 273 |
except Exception as e:
|
| 274 |
st.error(f"Error preparing download: {str(e)}")
|
| 275 |
-
elif merged_df is not None and merged_df.empty:
|
| 276 |
st.warning("The merged DataFrame is empty. Please check the input files and merging keys.")
|
| 277 |
else:
|
| 278 |
st.warning("One or more of the processed DataFrames is empty. Merging cannot proceed meaningfully.")
|
| 279 |
-
|
| 280 |
else:
|
| 281 |
st.warning("Please upload at least 2 files to merge.")
|
| 282 |
|
|
|
|
| 69 |
return df
|
| 70 |
|
| 71 |
def analyze_columns(df: pd.DataFrame, filename: str) -> dict:
|
| 72 |
+
"""Analyze DataFrame columns using Gemini AI with an updated prompt."""
|
| 73 |
try:
|
| 74 |
display_df = df.head(5).copy()
|
| 75 |
for col in display_df.columns:
|
| 76 |
display_df[col] = display_df[col].astype(str)
|
| 77 |
sample_csv = display_df.to_csv(index=False)
|
| 78 |
prompt = f"""
|
| 79 |
+
Analyze this CSV data, which may represent an employee earnings schedule, PAYE figures, or template info containing additional details for payroll processing. Provide an analysis in JSON format.
|
| 80 |
|
| 81 |
Filename: {filename}
|
| 82 |
Sample data (first 5 rows):
|
| 83 |
{sample_csv}
|
| 84 |
|
| 85 |
+
For merging these datasets, key columns are essential. The earnings schedule is considered the master file, and PAYE figures and template info should be merged into it using common identifiers such as Tax Identification Number (TIN), Employee ID, or Employee Name if unique.
|
| 86 |
|
| 87 |
+
Please analyze the columns in the sample data and identify potential key columns for merging. Also, report any data quality issues and suggest renames to standardize the column names.
|
| 88 |
|
| 89 |
Respond with ONLY a valid JSON object in the following format:
|
| 90 |
|
|
|
|
| 97 |
"description": "Brief description of the column and its likely content."
|
| 98 |
}}
|
| 99 |
],
|
| 100 |
+
"key_columns": ["List of identified key column names. Prioritize employee identifiers like employee_id, tin, or employee_name."],
|
| 101 |
"issues": ["List any data quality issues found, like missing values in important columns."],
|
| 102 |
"suggested_renames": {{
|
| 103 |
"old_name": "new_name"
|
| 104 |
}}
|
| 105 |
}}
|
| 106 |
|
| 107 |
+
Ensure the JSON response is valid and parsable.
|
| 108 |
"""
|
| 109 |
response = model.generate_content(prompt)
|
| 110 |
response_text = response.text.strip()
|
|
|
|
| 150 |
|
| 151 |
def _merge_on_tin(merged_df, master_file, other_file, label, description, suffix):
    """Left-merge one optional secondary file onto ``merged_df`` using the 'tin' column.

    Args:
        merged_df: The DataFrame accumulated so far (starts as a copy of the master).
        master_file: The file-info dict chosen as master; used to avoid merging
            the master onto itself.
        other_file: The file-info dict to merge in, or ``None`` if none was detected.
        label: Short name used in warnings (e.g. ``"PAYE"``).
        description: Longer name used in the progress message (e.g. ``"PAYE figures"``).
        suffix: Suffix applied to overlapping non-key columns coming from ``other_file``.

    Returns:
        The merged DataFrame, or ``merged_df`` unchanged when the merge is skipped.
    """
    if other_file is None:
        st.warning(f"No {label} file detected.")
        return merged_df

    # When no earnings file exists the first upload becomes the master; that
    # same upload may also have matched 'paye'/'template'. Merging it onto
    # itself would only duplicate every column with a suffix, so skip it.
    if other_file is master_file:
        st.warning(f"The {label} file is already being used as the master. Skipping {label} merge.")
        return merged_df

    st.write(
        f"Merging {description} from '{other_file['filename']}' onto the earnings sheet using key 'tin'."
    )
    other_df = other_file["df"]
    if 'tin' in merged_df.columns and 'tin' in other_df.columns:
        return merged_df.merge(other_df, on='tin', how='left', suffixes=('', suffix))

    st.warning(f"Column 'tin' missing in either the earnings or {label} file. Skipping {label} merge.")
    return merged_df


def merge_with_master(processed_files):
    """
    Merge multiple DataFrames by using the earnings schedule file as the master.

    Files are classified by filename (case-insensitive): a name containing
    'earnings' is the master, otherwise 'paye' marks the PAYE file, otherwise
    'template' marks the template file. If several files match the same
    category, the last one wins. If no earnings file is found, the first
    entry of ``processed_files`` is used as the master.

    The PAYE figures and the template info are left-merged onto the master
    using the 'tin' key; overlapping columns from those files receive the
    suffixes '_paye' and '_template' respectively.

    Args:
        processed_files: List of dicts, each with at least the keys
            ``"filename"`` (str) and ``"df"`` (DataFrame).

    Returns:
        The merged DataFrame, or ``None`` when ``processed_files`` is empty.
    """
    if not processed_files:
        # Nothing to fall back on; report instead of raising IndexError below.
        st.error("No master file could be determined. Merging cannot proceed.")
        return None

    master_file = None
    paye_file = None
    template_file = None
    for file_info in processed_files:
        lower_filename = file_info["filename"].lower()
        if "earnings" in lower_filename:
            master_file = file_info
        elif "paye" in lower_filename:
            paye_file = file_info
        elif "template" in lower_filename:
            template_file = file_info
    if not master_file:
        st.warning("No earnings file found as master. Using the first file as master.")
        master_file = processed_files[0]

    # Work on a copy so the caller's master DataFrame is left untouched.
    merged_df = master_file["df"].copy()

    # Merge PAYE figures onto the earnings sheet
    merged_df = _merge_on_tin(merged_df, master_file, paye_file, "PAYE", "PAYE figures", "_paye")

    # Merge template info onto the earnings sheet
    merged_df = _merge_on_tin(
        merged_df, master_file, template_file, "template", "template info", "_template"
    )

    return merged_df
|
| 196 |
|
| 197 |
def safe_display_df(df: pd.DataFrame) -> pd.DataFrame:
|
|
|
|
| 215 |
else:
|
| 216 |
df = pd.read_csv(uploaded_file)
|
| 217 |
if df is not None:
|
| 218 |
+
if df.empty:
|
| 219 |
st.warning(f"DataFrame from '{uploaded_file.name}' is empty after reading. Please check the file.")
|
| 220 |
+
continue
|
| 221 |
df.columns = [clean_column_name(col) for col in df.columns]
|
| 222 |
df = standardize_dataframe(df)
|
| 223 |
st.write("Initial Preview:")
|
|
|
|
| 227 |
if analysis:
|
| 228 |
st.write("Column Analysis:")
|
| 229 |
st.json(analysis)
|
| 230 |
+
# Apply suggested renames if provided
|
| 231 |
if 'suggested_renames' in analysis:
|
| 232 |
df = df.rename(columns=analysis['suggested_renames'])
|
| 233 |
processed_files.append(
|
| 234 |
{"filename": uploaded_file.name, "df": df, "analysis": analysis}
|
| 235 |
)
|
| 236 |
else:
|
| 237 |
+
st.error(f"Could not read data from '{uploaded_file.name}'.")
|
|
|
|
| 238 |
except Exception as e:
|
| 239 |
st.error(f"Error processing {uploaded_file.name}: {str(e)}")
|
| 240 |
continue
|
| 241 |
|
| 242 |
if len(processed_files) > 1:
|
| 243 |
+
if not any(file_info["df"].empty for file_info in processed_files):
|
| 244 |
+
st.write("### Merging DataFrames (Earnings as Master)")
|
| 245 |
merged_df = merge_with_master(processed_files)
|
| 246 |
+
if merged_df is not None and not merged_df.empty:
|
| 247 |
st.write("### Preview of Merged Data")
|
| 248 |
st.dataframe(safe_display_df(merged_df.head()))
|
| 249 |
try:
|
|
|
|
| 263 |
"Column": merged_df.columns,
|
| 264 |
"Missing Values": merged_df.isnull().sum().values,
|
| 265 |
"Missing Percentage": (merged_df.isnull().sum().values / len(merged_df) * 100).round(2),
|
| 266 |
+
}
|
| 267 |
+
)
|
| 268 |
st.dataframe(missing_df)
|
| 269 |
duplicates = merged_df.duplicated().sum()
|
| 270 |
st.write(f"Number of duplicate rows: {duplicates}")
|
| 271 |
except Exception as e:
|
| 272 |
st.error(f"Error preparing download: {str(e)}")
|
| 273 |
+
elif merged_df is not None and merged_df.empty:
|
| 274 |
st.warning("The merged DataFrame is empty. Please check the input files and merging keys.")
|
| 275 |
else:
|
| 276 |
st.warning("One or more of the processed DataFrames is empty. Merging cannot proceed meaningfully.")
|
|
|
|
| 277 |
else:
|
| 278 |
st.warning("Please upload at least 2 files to merge.")
|
| 279 |
|