Spaces:
Build error
Build error
Update app.py
Browse files
app.py
CHANGED
|
@@ -69,33 +69,42 @@ def standardize_dataframe(df: pd.DataFrame) -> pd.DataFrame:
|
|
| 69 |
return df
|
| 70 |
|
| 71 |
def analyze_columns(df: pd.DataFrame, filename: str) -> dict:
|
| 72 |
-
"""Analyze DataFrame columns using Gemini AI with improved error handling."""
|
| 73 |
try:
|
| 74 |
display_df = df.head(5).copy()
|
| 75 |
for col in display_df.columns:
|
| 76 |
display_df[col] = display_df[col].astype(str)
|
| 77 |
sample_csv = display_df.to_csv(index=False)
|
| 78 |
prompt = f"""
|
| 79 |
-
Analyze this CSV data and provide analysis in JSON format.
|
|
|
|
| 80 |
Filename: {filename}
|
| 81 |
-
Sample data:
|
| 82 |
{sample_csv}
|
| 83 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 84 |
{{
|
| 85 |
-
"subject": "Employee payroll data",
|
| 86 |
"columns": [
|
| 87 |
{{
|
| 88 |
"name": "column_name",
|
| 89 |
"type": "string/number/date",
|
| 90 |
-
"description": "Brief description"
|
| 91 |
}}
|
| 92 |
],
|
| 93 |
-
"key_columns": ["employee_id
|
| 94 |
-
"issues": ["
|
| 95 |
"suggested_renames": {{
|
| 96 |
"old_name": "new_name"
|
| 97 |
}}
|
| 98 |
}}
|
|
|
|
|
|
|
| 99 |
"""
|
| 100 |
response = model.generate_content(prompt)
|
| 101 |
response_text = response.text.strip()
|
|
@@ -156,12 +165,26 @@ def merge_with_master(processed_files):
|
|
| 156 |
st.warning("No master file with 'earnings' found. Using the first file as master.")
|
| 157 |
master_file = processed_files[0]
|
| 158 |
other_files = processed_files[1:]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 159 |
master_df = master_file["df"]
|
| 160 |
master_keys = master_file["analysis"].get("key_columns", [])
|
| 161 |
st.write(f"Using '{master_file['filename']}' as master with key columns: {master_keys}")
|
| 162 |
merged_df = master_df
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 163 |
for other in other_files:
|
| 164 |
other_df = other["df"]
|
|
|
|
|
|
|
|
|
|
|
|
|
| 165 |
other_keys = other["analysis"].get("key_columns", [])
|
| 166 |
common_keys = list(set(master_keys).intersection(set(other_keys)))
|
| 167 |
if common_keys:
|
|
@@ -192,6 +215,9 @@ def main():
|
|
| 192 |
else:
|
| 193 |
df = pd.read_csv(uploaded_file)
|
| 194 |
if df is not None:
|
|
|
|
|
|
|
|
|
|
| 195 |
df.columns = [clean_column_name(col) for col in df.columns]
|
| 196 |
df = standardize_dataframe(df)
|
| 197 |
st.write("Initial Preview:")
|
|
@@ -207,39 +233,49 @@ def main():
|
|
| 207 |
processed_files.append(
|
| 208 |
{"filename": uploaded_file.name, "df": df, "analysis": analysis}
|
| 209 |
)
|
|
|
|
|
|
|
|
|
|
| 210 |
except Exception as e:
|
| 211 |
st.error(f"Error processing {uploaded_file.name}: {str(e)}")
|
| 212 |
continue
|
|
|
|
| 213 |
if len(processed_files) > 1:
|
| 214 |
-
|
| 215 |
-
|
| 216 |
-
|
| 217 |
-
|
| 218 |
-
|
| 219 |
-
|
| 220 |
-
|
| 221 |
-
|
| 222 |
-
|
| 223 |
-
|
| 224 |
-
|
| 225 |
-
|
| 226 |
-
|
| 227 |
-
|
| 228 |
-
|
| 229 |
-
|
| 230 |
-
|
| 231 |
-
|
| 232 |
-
|
| 233 |
-
|
| 234 |
-
|
| 235 |
-
|
| 236 |
-
|
| 237 |
-
|
| 238 |
-
|
| 239 |
-
|
| 240 |
-
|
| 241 |
-
|
| 242 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 243 |
else:
|
| 244 |
st.warning("Please upload at least 2 files to merge.")
|
| 245 |
|
|
|
|
| 69 |
return df
|
| 70 |
|
| 71 |
def analyze_columns(df: pd.DataFrame, filename: str) -> dict:
|
| 72 |
+
"""Analyze DataFrame columns using Gemini AI with improved error handling and prompt."""
|
| 73 |
try:
|
| 74 |
display_df = df.head(5).copy()
|
| 75 |
for col in display_df.columns:
|
| 76 |
display_df[col] = display_df[col].astype(str)
|
| 77 |
sample_csv = display_df.to_csv(index=False)
|
| 78 |
prompt = f"""
|
| 79 |
+
Analyze this CSV data, which represents employee payroll information, and provide analysis in JSON format.
|
| 80 |
+
|
| 81 |
Filename: {filename}
|
| 82 |
+
Sample data (first 5 rows):
|
| 83 |
{sample_csv}
|
| 84 |
+
|
| 85 |
+
In the context of merging datasets, "key columns" are columns that uniquely identify records and are essential for joining this data with other datasets. For payroll data, key columns are typically employee identifiers such as Employee ID, Taxpayer Identification Number (TIN), or Employee Name (if unique).
|
| 86 |
+
|
| 87 |
+
Please analyze the columns in the sample data and identify potential key columns that can be used to merge this dataset with other employee-related datasets.
|
| 88 |
+
|
| 89 |
+
Respond with ONLY a valid JSON object in the following format:
|
| 90 |
+
|
| 91 |
{{
|
| 92 |
+
"subject": "Employee payroll data analysis",
|
| 93 |
"columns": [
|
| 94 |
{{
|
| 95 |
"name": "column_name",
|
| 96 |
"type": "string/number/date",
|
| 97 |
+
"description": "Brief description of the column and its likely content."
|
| 98 |
}}
|
| 99 |
],
|
| 100 |
+
"key_columns": ["List of identified key column names. Prioritize employee identifiers like employee_id, tin, or employee_name if they appear to be unique identifiers."],
|
| 101 |
+
"issues": ["List any data quality issues found, like missing values in important columns."],
|
| 102 |
"suggested_renames": {{
|
| 103 |
"old_name": "new_name"
|
| 104 |
}}
|
| 105 |
}}
|
| 106 |
+
|
| 107 |
+
Ensure the JSON response is valid and parsable. Focus on accurately identifying key columns relevant for merging payroll data.
|
| 108 |
"""
|
| 109 |
response = model.generate_content(prompt)
|
| 110 |
response_text = response.text.strip()
|
|
|
|
| 165 |
st.warning("No master file with 'earnings' found. Using the first file as master.")
|
| 166 |
master_file = processed_files[0]
|
| 167 |
other_files = processed_files[1:]
|
| 168 |
+
|
| 169 |
+
if not master_file: # Handle case where no files are processed correctly
|
| 170 |
+
st.error("No master file could be determined. Merging cannot proceed.")
|
| 171 |
+
return None
|
| 172 |
+
|
| 173 |
master_df = master_file["df"]
|
| 174 |
master_keys = master_file["analysis"].get("key_columns", [])
|
| 175 |
st.write(f"Using '{master_file['filename']}' as master with key columns: {master_keys}")
|
| 176 |
merged_df = master_df
|
| 177 |
+
|
| 178 |
+
if merged_df.empty: # Check if master_df is empty. If so, no point merging.
|
| 179 |
+
st.warning(f"Master DataFrame '{master_file['filename']}' is empty. Merging will result in an empty DataFrame.")
|
| 180 |
+
return merged_df
|
| 181 |
+
|
| 182 |
for other in other_files:
|
| 183 |
other_df = other["df"]
|
| 184 |
+
if other_df.empty: # Check if other_df is empty before merging
|
| 185 |
+
st.warning(f"DataFrame '{other['filename']}' is empty. Skipping merge for this file.")
|
| 186 |
+
continue
|
| 187 |
+
|
| 188 |
other_keys = other["analysis"].get("key_columns", [])
|
| 189 |
common_keys = list(set(master_keys).intersection(set(other_keys)))
|
| 190 |
if common_keys:
|
|
|
|
| 215 |
else:
|
| 216 |
df = pd.read_csv(uploaded_file)
|
| 217 |
if df is not None:
|
| 218 |
+
if df.empty: # Check if dataframe is empty immediately after reading.
|
| 219 |
+
st.warning(f"DataFrame from '{uploaded_file.name}' is empty after reading. Please check the file.")
|
| 220 |
+
continue # Skip to next file
|
| 221 |
df.columns = [clean_column_name(col) for col in df.columns]
|
| 222 |
df = standardize_dataframe(df)
|
| 223 |
st.write("Initial Preview:")
|
|
|
|
| 233 |
processed_files.append(
|
| 234 |
{"filename": uploaded_file.name, "df": df, "analysis": analysis}
|
| 235 |
)
|
| 236 |
+
else:
|
| 237 |
+
st.error(f"Could not read data from '{uploaded_file.name}'.") # Explicit error if read_excel_file returns None
|
| 238 |
+
|
| 239 |
except Exception as e:
|
| 240 |
st.error(f"Error processing {uploaded_file.name}: {str(e)}")
|
| 241 |
continue
|
| 242 |
+
|
| 243 |
if len(processed_files) > 1:
|
| 244 |
+
if not any(file_info["df"].empty for file_info in processed_files): # Check if any processed dataframe is empty before merging
|
| 245 |
+
st.write("### Merging DataFrames with Earnings Schedule as Master")
|
| 246 |
+
merged_df = merge_with_master(processed_files)
|
| 247 |
+
if merged_df is not None and not merged_df.empty: # Check merged_df is not None and not empty before displaying.
|
| 248 |
+
st.write("### Preview of Merged Data")
|
| 249 |
+
st.dataframe(safe_display_df(merged_df.head()))
|
| 250 |
+
try:
|
| 251 |
+
csv = merged_df.to_csv(index=False)
|
| 252 |
+
st.download_button(
|
| 253 |
+
label="Download Merged CSV",
|
| 254 |
+
data=csv,
|
| 255 |
+
file_name="merged_data.csv",
|
| 256 |
+
mime="text/csv",
|
| 257 |
+
)
|
| 258 |
+
st.write("### Dataset Statistics")
|
| 259 |
+
st.write(f"Total rows: {len(merged_df)}")
|
| 260 |
+
st.write(f"Total columns: {len(merged_df.columns)}")
|
| 261 |
+
st.write("### Data Quality Metrics")
|
| 262 |
+
missing_df = pd.DataFrame(
|
| 263 |
+
{
|
| 264 |
+
"Column": merged_df.columns,
|
| 265 |
+
"Missing Values": merged_df.isnull().sum().values,
|
| 266 |
+
"Missing Percentage": (merged_df.isnull().sum().values / len(merged_df) * 100).round(2),
|
| 267 |
+
}
|
| 268 |
+
)
|
| 269 |
+
st.dataframe(missing_df)
|
| 270 |
+
duplicates = merged_df.duplicated().sum()
|
| 271 |
+
st.write(f"Number of duplicate rows: {duplicates}")
|
| 272 |
+
except Exception as e:
|
| 273 |
+
st.error(f"Error preparing download: {str(e)}")
|
| 274 |
+
elif merged_df is not None and merged_df.empty: # Explicitly handle empty merged dataframe case
|
| 275 |
+
st.warning("The merged DataFrame is empty. Please check the input files and merging keys.")
|
| 276 |
+
else:
|
| 277 |
+
st.warning("One or more of the processed DataFrames is empty. Merging cannot proceed meaningfully.")
|
| 278 |
+
|
| 279 |
else:
|
| 280 |
st.warning("Please upload at least 2 files to merge.")
|
| 281 |
|