Spaces:

rairo
/

OneExcelZimraAI

Build error

App Files Community

rairo committed on Feb 21, 2025

Commit

c71c6c1

verified ·

1 Parent(s): 1f19aba

Update app.py

Browse files

Files changed (1) hide show

app.py +50 -53

app.py CHANGED Viewed

@@ -69,22 +69,22 @@ def standardize_dataframe(df: pd.DataFrame) -> pd.DataFrame:
     return df
 def analyze_columns(df: pd.DataFrame, filename: str) -> dict:
-    """Analyze DataFrame columns using Gemini AI with improved error handling and prompt."""
     try:
         display_df = df.head(5).copy()
         for col in display_df.columns:
             display_df[col] = display_df[col].astype(str)
         sample_csv = display_df.to_csv(index=False)
         prompt = f"""
-        Analyze this CSV data, which represents employee payroll information, and provide analysis in JSON format.
         Filename: {filename}
         Sample data (first 5 rows):
         {sample_csv}
-        In the context of merging datasets, "key columns" are columns that uniquely identify records and are essential for joining this data with other datasets. For payroll data, key columns are typically employee identifiers such as Employee ID, Taxpayer Identification Number (TIN), or Employee Name (if unique).
-        Please analyze the columns in the sample data and identify potential key columns that can be used to merge this dataset with other employee-related datasets.
         Respond with ONLY a valid JSON object in the following format:
@@ -97,14 +97,14 @@ def analyze_columns(df: pd.DataFrame, filename: str) -> dict:
                     "description": "Brief description of the column and its likely content."
                 }}
             ],
-            "key_columns": ["List of identified key column names.  Prioritize employee identifiers like employee_id, tin, or employee_name if they appear to be unique identifiers."],
             "issues": ["List any data quality issues found, like missing values in important columns."],
             "suggested_renames": {{
                 "old_name": "new_name"
             }}
         }}
-        Ensure the JSON response is valid and parsable. Focus on accurately identifying key columns relevant for merging payroll data.
         """
         response = model.generate_content(prompt)
         response_text = response.text.strip()
@@ -150,49 +150,48 @@ def read_excel_file(file) -> pd.DataFrame:
 def merge_with_master(processed_files):
     """
-    Merge multiple DataFrames using the earnings schedule file as the master.
-    The master file is identified by having 'earnings' in its filename (case insensitive).
-    Other files are merged onto the master using key columns identified by AI analysis.
     """
     master_file = None
-    other_files = []
     for file_info in processed_files:
-        if "earnings" in file_info["filename"].lower():
             master_file = file_info
-        else:
-            other_files.append(file_info)
     if not master_file:
-        st.warning("No master file with 'earnings' found. Using the first file as master.")
         master_file = processed_files[0]
-        other_files = processed_files[1:]
-    if not master_file: # Handle case where no files are processed correctly
-        st.error("No master file could be determined. Merging cannot proceed.")
-        return None
     master_df = master_file["df"]
-    if master_df.empty: # Check if master_df is empty. If so, no point merging.
-        st.warning(f"Master DataFrame '{master_file['filename']}' is empty. Merging will result in an empty DataFrame.")
-        return pd.DataFrame() # Return empty DataFrame
-    master_keys = master_file["analysis"].get("key_columns", [])
-    st.write(f"Using '{master_file['filename']}' as master with key columns: {master_keys}")
-    merged_df = master_df
-    for other in other_files:
-        other_df = other["df"]
-        if other_df.empty: # Check if other_df is empty before merging
-            st.warning(f"DataFrame '{other['filename']}' is empty. Skipping merge for this file.")
-            continue
-        other_keys = other["analysis"].get("key_columns", [])
-        common_keys = list(set(master_keys).intersection(set(other_keys)))
-        if common_keys:
-            st.write(f"Merging '{other['filename']}' on keys: {common_keys}")
-            merged_df = merged_df.merge(other_df, on=common_keys, how="left")
         else:
-            st.warning(f"No common keys found for merging '{other['filename']}'. Skipping this file.")
     return merged_df
 def safe_display_df(df: pd.DataFrame) -> pd.DataFrame:
@@ -216,9 +215,9 @@ def main():
                 else:
                     df = pd.read_csv(uploaded_file)
                 if df is not None:
-                    if df.empty: # Check if dataframe is empty immediately after reading.
                         st.warning(f"DataFrame from '{uploaded_file.name}' is empty after reading. Please check the file.")
-                        continue # Skip to next file
                     df.columns = [clean_column_name(col) for col in df.columns]
                     df = standardize_dataframe(df)
                     st.write("Initial Preview:")
@@ -228,24 +227,23 @@ def main():
                     if analysis:
                         st.write("Column Analysis:")
                         st.json(analysis)
-                        # Apply suggested renames
                         if 'suggested_renames' in analysis:
                             df = df.rename(columns=analysis['suggested_renames'])
                         processed_files.append(
                             {"filename": uploaded_file.name, "df": df, "analysis": analysis}
                         )
                 else:
-                    st.error(f"Could not read data from '{uploaded_file.name}'.") # Explicit error if read_excel_file returns None
             except Exception as e:
                 st.error(f"Error processing {uploaded_file.name}: {str(e)}")
                 continue
         if len(processed_files) > 1:
-            if not any(file_info["df"].empty for file_info in processed_files): # Check if any processed dataframe is empty before merging
-                st.write("### Merging DataFrames with Earnings Schedule as Master")
                 merged_df = merge_with_master(processed_files)
-                if merged_df is not None and not merged_df.empty: # Check merged_df is not None and not empty before displaying.
                     st.write("### Preview of Merged Data")
                     st.dataframe(safe_display_df(merged_df.head()))
                     try:
@@ -265,18 +263,17 @@ def main():
                                 "Column": merged_df.columns,
                                 "Missing Values": merged_df.isnull().sum().values,
                                 "Missing Percentage": (merged_df.isnull().sum().values / len(merged_df) * 100).round(2),
-                                }
-                            )
                         st.dataframe(missing_df)
                         duplicates = merged_df.duplicated().sum()
                         st.write(f"Number of duplicate rows: {duplicates}")
                     except Exception as e:
                         st.error(f"Error preparing download: {str(e)}")
-                elif merged_df is not None and merged_df.empty: # Explicitly handle empty merged dataframe case
                     st.warning("The merged DataFrame is empty. Please check the input files and merging keys.")
             else:
                 st.warning("One or more of the processed DataFrames is empty. Merging cannot proceed meaningfully.")
         else:
             st.warning("Please upload at least 2 files to merge.")

     return df
 def analyze_columns(df: pd.DataFrame, filename: str) -> dict:
+    """Analyze DataFrame columns using Gemini AI with an updated prompt."""
     try:
         display_df = df.head(5).copy()
         for col in display_df.columns:
             display_df[col] = display_df[col].astype(str)
         sample_csv = display_df.to_csv(index=False)
         prompt = f"""
+        Analyze this CSV data, which may represent an employee earnings schedule, PAYE figures, or template info containing additional details for payroll processing. Provide an analysis in JSON format.
         Filename: {filename}
         Sample data (first 5 rows):
         {sample_csv}
+        For merging these datasets, key columns are essential. The earnings schedule is considered the master file, and PAYE figures and template info should be merged into it using common identifiers such as Tax Identification Number (TIN), Employee ID, or Employee Name if unique.
+        Please analyze the columns in the sample data and identify potential key columns for merging. Also, report any data quality issues and suggest renames to standardize the column names.
         Respond with ONLY a valid JSON object in the following format:
                     "description": "Brief description of the column and its likely content."
                 }}
             ],
+            "key_columns": ["List of identified key column names. Prioritize employee identifiers like employee_id, tin, or employee_name."],
             "issues": ["List any data quality issues found, like missing values in important columns."],
             "suggested_renames": {{
                 "old_name": "new_name"
             }}
         }}
+        Ensure the JSON response is valid and parsable.
         """
         response = model.generate_content(prompt)
         response_text = response.text.strip()
 def merge_with_master(processed_files):
     """
+    Merge multiple DataFrames by using the earnings schedule file as the master.
+    This modified logic looks for files whose names include 'earnings', 'paye', or 'template'.
+    The PAYE figures and the template info are merged onto the earnings sheet using the 'tin' key.
     """
     master_file = None
+    paye_file = None
+    template_file = None
     for file_info in processed_files:
+        lower_filename = file_info["filename"].lower()
+        if "earnings" in lower_filename:
             master_file = file_info
+        elif "paye" in lower_filename:
+            paye_file = file_info
+        elif "template" in lower_filename:
+            template_file = file_info
     if not master_file:
+        st.warning("No earnings file found as master. Using the first file as master.")
         master_file = processed_files[0]
     master_df = master_file["df"]
+    merged_df = master_df.copy()
+    # Merge PAYE figures onto the earnings sheet
+    if paye_file is not None:
+        st.write(f"Merging PAYE figures from '{paye_file['filename']}' onto the earnings sheet using key 'tin'.")
+        if 'tin' in merged_df.columns and 'tin' in paye_file["df"].columns:
+            merged_df = merged_df.merge(paye_file["df"], on='tin', how='left', suffixes=('', '_paye'))
+        else:
+            st.warning("Column 'tin' missing in either the earnings or PAYE file. Skipping PAYE merge.")
+    else:
+        st.warning("No PAYE file detected.")
+    # Merge template info onto the earnings sheet
+    if template_file is not None:
+        st.write(f"Merging template info from '{template_file['filename']}' onto the earnings sheet using key 'tin'.")
+        if 'tin' in merged_df.columns and 'tin' in template_file["df"].columns:
+            merged_df = merged_df.merge(template_file["df"], on='tin', how='left', suffixes=('', '_template'))
         else:
+            st.warning("Column 'tin' missing in either the earnings or template file. Skipping template merge.")
+    else:
+        st.warning("No template file detected.")
     return merged_df
 def safe_display_df(df: pd.DataFrame) -> pd.DataFrame:
                 else:
                     df = pd.read_csv(uploaded_file)
                 if df is not None:
+                    if df.empty:
                         st.warning(f"DataFrame from '{uploaded_file.name}' is empty after reading. Please check the file.")
+                        continue
                     df.columns = [clean_column_name(col) for col in df.columns]
                     df = standardize_dataframe(df)
                     st.write("Initial Preview:")
                     if analysis:
                         st.write("Column Analysis:")
                         st.json(analysis)
+                        # Apply suggested renames if provided
                         if 'suggested_renames' in analysis:
                             df = df.rename(columns=analysis['suggested_renames'])
                         processed_files.append(
                             {"filename": uploaded_file.name, "df": df, "analysis": analysis}
                         )
                 else:
+                    st.error(f"Could not read data from '{uploaded_file.name}'.")
             except Exception as e:
                 st.error(f"Error processing {uploaded_file.name}: {str(e)}")
                 continue
         if len(processed_files) > 1:
+            if not any(file_info["df"].empty for file_info in processed_files):
+                st.write("### Merging DataFrames (Earnings as Master)")
                 merged_df = merge_with_master(processed_files)
+                if merged_df is not None and not merged_df.empty:
                     st.write("### Preview of Merged Data")
                     st.dataframe(safe_display_df(merged_df.head()))
                     try:
                                 "Column": merged_df.columns,
                                 "Missing Values": merged_df.isnull().sum().values,
                                 "Missing Percentage": (merged_df.isnull().sum().values / len(merged_df) * 100).round(2),
+                            }
+                        )
                         st.dataframe(missing_df)
                         duplicates = merged_df.duplicated().sum()
                         st.write(f"Number of duplicate rows: {duplicates}")
                     except Exception as e:
                         st.error(f"Error preparing download: {str(e)}")
+                elif merged_df is not None and merged_df.empty:
                     st.warning("The merged DataFrame is empty. Please check the input files and merging keys.")
             else:
                 st.warning("One or more of the processed DataFrames is empty. Merging cannot proceed meaningfully.")
         else:
             st.warning("Please upload at least 2 files to merge.")