Spaces:

rairo
/

OneExcelZimraAI

Build error

App Files Community

rairo committed on Feb 24, 2025

Commit

8018c05

verified ·

1 Parent(s): d2c0f12

Update app.py

Browse files

Files changed (1) hide show

app.py +37 -98

app.py CHANGED Viewed

@@ -14,8 +14,7 @@ model = genai.GenerativeModel('gemini-2.0-flash-thinking-exp')
 def clean_column_name(col_name):
     """
-    Clean column names to be compatible with Arrow.
-    Converts to lowercase and replaces non-alphanumeric characters with underscores.
     """
     if not isinstance(col_name, str):
         return str(col_name)
@@ -38,23 +37,21 @@ def clean_tin_value(val):
 def standardize_dataframe(df: pd.DataFrame) -> pd.DataFrame:
     """
     Standardize DataFrame column names and data types:
-    - Drops any middle name columns.
-    - Cleans all column names (e.g., "Employee Name" becomes "employee_name").
-    - Renames synonyms to common names (e.g., mapping TIN-related columns to 'tin'
-      and salary-related columns to 'salary').
-    - Creates an 'employee_name' column if missing but first_name and last_name exist.
-    - Combines duplicate key columns into one.
-    - Forces key columns (tin and employee_name) to be strings.
     """
-    # Drop any column that appears to be a middle name
     middle_name_cols = [col for col in df.columns if 'middle_name' in col.lower()]
     if middle_name_cols:
         df = df.drop(columns=middle_name_cols)
-    # Clean all column names so that "Employee Name" becomes "employee_name", etc.
     df.columns = [clean_column_name(col) for col in df.columns]
-    # Build a rename map for TIN and salary synonyms.
     rename_map = {}
     for col in df.columns:
         if col in ['personal id', 'personal_id', 'tax id', 'taxid'] or "personal_id_of_employee" in col:
@@ -66,7 +63,7 @@ def standardize_dataframe(df: pd.DataFrame) -> pd.DataFrame:
     if rename_map:
         df = df.rename(columns=rename_map)
-    # Combine duplicate columns for salary and tin if needed
     if 'salary' in df.columns and list(df.columns).count('salary') > 1:
         salary_cols = [col for col in df.columns if col == 'salary']
         df['salary'] = df[salary_cols].bfill(axis=1).iloc[:, 0]
@@ -76,11 +73,11 @@ def standardize_dataframe(df: pd.DataFrame) -> pd.DataFrame:
         df['tin'] = df[tin_cols].bfill(axis=1).iloc[:, 0]
         df = df.loc[:, ~df.columns.duplicated()]
-    # If employee_name is missing but first_name and last_name exist, create it.
     if 'employee_name' not in df.columns and 'first_name' in df.columns and 'last_name' in df.columns:
         df['employee_name'] = df['first_name'].astype(str).str.strip() + ' ' + df['last_name'].astype(str).str.strip()
-    # Ensure key columns are of the proper type.
     if 'salary' in df.columns:
         df['salary'] = pd.to_numeric(df['salary'], errors='coerce')
     if 'tin' in df.columns:
@@ -92,9 +89,7 @@ def standardize_dataframe(df: pd.DataFrame) -> pd.DataFrame:
 def analyze_columns(df: pd.DataFrame, filename: str) -> dict:
     """
-    Analyze DataFrame columns using Gemini AI.
-    Returns a JSON object with details about columns, key columns for merging,
-    any data quality issues, and suggested renames.
     """
     try:
         display_df = df.head(5).copy()
@@ -102,35 +97,12 @@ def analyze_columns(df: pd.DataFrame, filename: str) -> dict:
             display_df[col] = display_df[col].astype(str)
         sample_csv = display_df.to_csv(index=False)
         prompt = f"""
-        Analyze this CSV data, which may represent an employee earnings schedule, PAYE figures, or template info containing additional details for payroll processing. Provide an analysis in JSON format.
         Filename: {filename}
         Sample data (first 5 rows):
         {sample_csv}
-        For merging these datasets, key columns are essential. The earnings schedule is considered the master file, and PAYE figures and template info should be merged into it using common identifiers such as Tax Identification Number (TIN), Employee ID, or Employee Name if unique.
-        Please analyze the columns in the sample data and identify potential key columns for merging. Also, report any data quality issues and suggest renames to standardize the column names.
-        Respond with ONLY a valid JSON object in the following format:
-        {{
-            "subject": "Employee payroll data analysis",
-            "columns": [
-                {{
-                    "name": "column_name",
-                    "type": "string/number/date",
-                    "description": "Brief description of the column and its likely content."
-                }}
-            ],
-            "key_columns": ["List of identified key column names. Prioritize employee identifiers like employee_id, tin, or employee_name."],
-            "issues": ["List any data quality issues found, like missing values in important columns."],
-            "suggested_renames": {{
-                "old_name": "new_name"
-            }}
-        }}
-        Ensure the JSON response is valid and parsable.
         """
         response = model.generate_content(prompt)
         response_text = response.text.strip()
@@ -146,27 +118,14 @@ def analyze_columns(df: pd.DataFrame, filename: str) -> dict:
             st.error(f"JSON parsing error: {str(je)}")
             st.text("Raw response:")
             st.text(response_text)
-            return {
-                "subject": "Error parsing analysis",
-                "columns": [],
-                "key_columns": [],
-                "issues": ["Error analyzing columns"],
-                "suggested_renames": {},
-            }
     except Exception as e:
         st.error(f"Error in column analysis: {str(e)}")
-        return {
-            "subject": "Error in analysis",
-            "columns": [],
-            "key_columns": [],
-            "issues": [str(e)],
-            "suggested_renames": {},
-        }
 def read_excel_file(file) -> pd.DataFrame:
     """
     Read an Excel file with error handling.
-    Tries openpyxl first and falls back to xlrd.
     """
     try:
         return pd.read_excel(file, engine="openpyxl")
@@ -179,19 +138,17 @@ def read_excel_file(file) -> pd.DataFrame:
 def merge_with_master(processed_files):
     """
-    Merge multiple DataFrames using a two-step process:
-    1. Use the earnings file as master and drop its inaccurate 'tin' column.
-    2. Merge template info onto earnings using 'employee_name' (the key provided by "Employee Name").
-       The trusted 'tin' comes from the template file.
-       For the template file, force its first column (which is "Personal ID of Employee") to be 'tin'.
-    3. Check that the merged earnings-template data has a 'tin' column populated.
-       If present, merge the resulting DataFrame with the PAYE file using 'tin'.
     """
     earnings_file = None
     paye_file = None
     template_file = None
-    # Identify files based on filename keywords.
     for file_info in processed_files:
         lower_filename = file_info["filename"].lower()
         if "earnings" in lower_filename:
@@ -205,49 +162,43 @@ def merge_with_master(processed_files):
         st.warning("No earnings file found as master. Using the first file as master.")
         earnings_file = processed_files[0]
-    # Use the earnings DataFrame as the master.
     earnings_df = earnings_file["df"]
-    # Drop the inaccurate 'tin' column from earnings if it exists.
     if 'tin' in earnings_df.columns:
         earnings_df = earnings_df.drop(columns=['tin'])
-    # Remove any middle_name column.
     if 'middle_name' in earnings_df.columns:
         earnings_df = earnings_df.drop(columns=['middle_name'])
     merged_df = earnings_df.copy()
-    # Merge template info onto earnings using 'employee_name'
     if template_file is not None:
         st.write(f"Merging template info from '{template_file['filename']}' using key 'employee_name'.")
         template_df = template_file["df"].copy()
-        # Force the first column of the template file to be 'tin'
         if not template_df.empty:
             cols = list(template_df.columns)
             cols[0] = "tin"
             template_df.columns = cols
-        # Remove any middle_name column from the template file.
         if 'middle_name' in template_df.columns:
             template_df = template_df.drop(columns=['middle_name'])
-        # Ensure the template has an 'employee_name' column.
         if 'employee_name' not in template_df.columns and 'first_name' in template_df.columns and 'last_name' in template_df.columns:
             template_df['employee_name'] = template_df['first_name'].astype(str).str.strip() + ' ' + template_df['last_name'].astype(str).str.strip()
-        # If after standardization the template still doesn't have employee_name,
-        # you may need to construct it manually if possible.
         if 'employee_name' in merged_df.columns and 'employee_name' in template_df.columns:
             merged_df = merged_df.merge(template_df, on='employee_name', how='left', suffixes=('', '_template'))
         else:
-            st.warning("Column 'employee_name' missing in either the earnings or template file. Skipping template merge.")
     else:
-        st.warning("No template file detected. Cannot proceed without a trusted TIN from the template.")
-    # Check that a trusted 'tin' column exists from the template merge.
     if 'tin' not in merged_df.columns or merged_df['tin'].isnull().all():
         st.error("No trusted 'tin' column found in the merged earnings-template data. Aborting further merge. "
-                 "Please ensure the template file's first column holds the trusted TIN and is properly standardized.")
         return merged_df
-    # Merge PAYE figures onto the merged DataFrame using 'tin'
     if paye_file is not None:
         st.write(f"Merging PAYE figures from '{paye_file['filename']}' using key 'tin'.")
         paye_df = paye_file["df"]
@@ -262,17 +213,14 @@ def merge_with_master(processed_files):
 def safe_display_df(df: pd.DataFrame) -> pd.DataFrame:
     """
-    Prepare DataFrame for safe display in Streamlit by converting all entries to strings
-    and replacing common null placeholders.
     """
     return df.astype(str).replace({"nan": "", "None": ""})
 def main():
     st.title("Smart CSV Processor")
     st.write("Upload CSV or Excel files for intelligent analysis and merging.")
-    uploaded_files = st.file_uploader(
-        "Choose files", accept_multiple_files=True, type=["csv", "xlsx", "xls"]
-    )
     if uploaded_files:
         st.write("### Processing Files")
         processed_files = []
@@ -287,7 +235,6 @@ def main():
                     if df.empty:
                         st.warning(f"DataFrame from '{uploaded_file.name}' is empty. Please check the file.")
                         continue
-                    # Standardize columns and key identifiers.
                     df = standardize_dataframe(df)
                     st.write("Initial Preview:")
                     st.dataframe(df.head())
@@ -296,12 +243,9 @@ def main():
                     if analysis:
                         st.write("Column Analysis:")
                         st.json(analysis)
-                        # Apply any suggested renames from the analysis.
                         if 'suggested_renames' in analysis:
                             df = df.rename(columns=analysis['suggested_renames'])
-                        processed_files.append(
-                            {"filename": uploaded_file.name, "df": df, "analysis": analysis}
-                        )
                 else:
                     st.error(f"Could not read data from '{uploaded_file.name}'.")
             except Exception as e:
@@ -317,12 +261,7 @@ def main():
                     st.dataframe(safe_display_df(merged_df.head()))
                     try:
                         csv = merged_df.to_csv(index=False)
-                        st.download_button(
-                            label="Download Merged CSV",
-                            data=csv,
-                            file_name="merged_data.csv",
-                            mime="text/csv",
-                        )
                         st.write("### Dataset Statistics")
                         st.write(f"Total rows: {len(merged_df)}")
                         st.write(f"Total columns: {len(merged_df.columns)}")

 def clean_column_name(col_name):
     """
+    Clean column names: convert to lowercase, replace non-alphanumeric characters with underscores.
     """
     if not isinstance(col_name, str):
         return str(col_name)
 def standardize_dataframe(df: pd.DataFrame) -> pd.DataFrame:
     """
     Standardize DataFrame column names and data types:
+      - Drop any middle name columns.
+      - Clean column names (e.g. "Employee Name" becomes "employee_name").
+      - Rename synonyms (e.g., "Personal ID of Employee" to "tin").
+      - If missing, construct an 'employee_name' column from first and last names.
+      - Ensure key columns (tin and employee_name) are strings.
     """
+    # Drop columns containing 'middle_name'
     middle_name_cols = [col for col in df.columns if 'middle_name' in col.lower()]
     if middle_name_cols:
         df = df.drop(columns=middle_name_cols)
+    # Clean all column names
     df.columns = [clean_column_name(col) for col in df.columns]
+    # Rename synonyms for TIN and salary
     rename_map = {}
     for col in df.columns:
         if col in ['personal id', 'personal_id', 'tax id', 'taxid'] or "personal_id_of_employee" in col:
     if rename_map:
         df = df.rename(columns=rename_map)
+    # Combine duplicate columns if necessary
     if 'salary' in df.columns and list(df.columns).count('salary') > 1:
         salary_cols = [col for col in df.columns if col == 'salary']
         df['salary'] = df[salary_cols].bfill(axis=1).iloc[:, 0]
         df['tin'] = df[tin_cols].bfill(axis=1).iloc[:, 0]
         df = df.loc[:, ~df.columns.duplicated()]
+    # Construct employee_name if missing
     if 'employee_name' not in df.columns and 'first_name' in df.columns and 'last_name' in df.columns:
         df['employee_name'] = df['first_name'].astype(str).str.strip() + ' ' + df['last_name'].astype(str).str.strip()
+    # Ensure proper types for key columns
     if 'salary' in df.columns:
         df['salary'] = pd.to_numeric(df['salary'], errors='coerce')
     if 'tin' in df.columns:
 def analyze_columns(df: pd.DataFrame, filename: str) -> dict:
     """
+    Use Gemini AI to analyze DataFrame columns and suggest key columns and renames.
     """
     try:
         display_df = df.head(5).copy()
             display_df[col] = display_df[col].astype(str)
         sample_csv = display_df.to_csv(index=False)
         prompt = f"""
+        Analyze this CSV data, which may represent an employee earnings schedule, PAYE figures, or template info for payroll processing.
         Filename: {filename}
         Sample data (first 5 rows):
         {sample_csv}
+        Identify potential key columns for merging and suggest renames.
+        Respond with a valid JSON object.
         """
         response = model.generate_content(prompt)
         response_text = response.text.strip()
             st.error(f"JSON parsing error: {str(je)}")
             st.text("Raw response:")
             st.text(response_text)
+            return {"subject": "Error parsing analysis", "columns": [], "key_columns": [], "issues": ["Error analyzing columns"], "suggested_renames": {}}
     except Exception as e:
         st.error(f"Error in column analysis: {str(e)}")
+        return {"subject": "Error in analysis", "columns": [], "key_columns": [], "issues": [str(e)], "suggested_renames": {}}
 def read_excel_file(file) -> pd.DataFrame:
     """
     Read an Excel file with error handling.
     """
     try:
         return pd.read_excel(file, engine="openpyxl")
 def merge_with_master(processed_files):
     """
+    Merge DataFrames in two steps:
+      1. Use the earnings file as master (dropping its inaccurate 'tin').
+      2. Merge the template file (which supplies the trusted TIN via its first column)
+         with the earnings data using 'employee_name'.
+      3. Finally, merge the combined data with the PAYE file using 'tin'.
     """
     earnings_file = None
     paye_file = None
     template_file = None
+    # Identify files by filename keywords
     for file_info in processed_files:
         lower_filename = file_info["filename"].lower()
         if "earnings" in lower_filename:
         st.warning("No earnings file found as master. Using the first file as master.")
         earnings_file = processed_files[0]
+    # Process earnings file: drop its inaccurate TIN column
     earnings_df = earnings_file["df"]
     if 'tin' in earnings_df.columns:
         earnings_df = earnings_df.drop(columns=['tin'])
     if 'middle_name' in earnings_df.columns:
         earnings_df = earnings_df.drop(columns=['middle_name'])
     merged_df = earnings_df.copy()
+    # Process and merge the template file using employee_name
     if template_file is not None:
         st.write(f"Merging template info from '{template_file['filename']}' using key 'employee_name'.")
         template_df = template_file["df"].copy()
+        # Force the first column (Personal ID of Employee) to be 'tin'
         if not template_df.empty:
             cols = list(template_df.columns)
             cols[0] = "tin"
             template_df.columns = cols
         if 'middle_name' in template_df.columns:
             template_df = template_df.drop(columns=['middle_name'])
+        # If employee_name is not present, construct it from first_name and last_name
         if 'employee_name' not in template_df.columns and 'first_name' in template_df.columns and 'last_name' in template_df.columns:
             template_df['employee_name'] = template_df['first_name'].astype(str).str.strip() + ' ' + template_df['last_name'].astype(str).str.strip()
         if 'employee_name' in merged_df.columns and 'employee_name' in template_df.columns:
             merged_df = merged_df.merge(template_df, on='employee_name', how='left', suffixes=('', '_template'))
         else:
+            st.warning("Column 'employee_name' missing in either earnings or template file. Skipping template merge.")
     else:
+        st.warning("No template file detected. Cannot proceed without a trusted TIN.")
+    # Check for a trusted 'tin' column after merging earnings and template
     if 'tin' not in merged_df.columns or merged_df['tin'].isnull().all():
         st.error("No trusted 'tin' column found in the merged earnings-template data. Aborting further merge. "
+                 "Ensure the template file's first column (Personal ID of Employee) is correctly populated.")
         return merged_df
+    # Merge PAYE file using the trusted 'tin'
     if paye_file is not None:
         st.write(f"Merging PAYE figures from '{paye_file['filename']}' using key 'tin'.")
         paye_df = paye_file["df"]
 def safe_display_df(df: pd.DataFrame) -> pd.DataFrame:
     """
+    Convert all entries in the DataFrame to strings and replace common null placeholders.
     """
     return df.astype(str).replace({"nan": "", "None": ""})
 def main():
     st.title("Smart CSV Processor")
     st.write("Upload CSV or Excel files for intelligent analysis and merging.")
+    uploaded_files = st.file_uploader("Choose files", accept_multiple_files=True, type=["csv", "xlsx", "xls"])
     if uploaded_files:
         st.write("### Processing Files")
         processed_files = []
                     if df.empty:
                         st.warning(f"DataFrame from '{uploaded_file.name}' is empty. Please check the file.")
                         continue
                     df = standardize_dataframe(df)
                     st.write("Initial Preview:")
                     st.dataframe(df.head())
                     if analysis:
                         st.write("Column Analysis:")
                         st.json(analysis)
                         if 'suggested_renames' in analysis:
                             df = df.rename(columns=analysis['suggested_renames'])
+                        processed_files.append({"filename": uploaded_file.name, "df": df, "analysis": analysis})
                 else:
                     st.error(f"Could not read data from '{uploaded_file.name}'.")
             except Exception as e:
                     st.dataframe(safe_display_df(merged_df.head()))
                     try:
                         csv = merged_df.to_csv(index=False)
+                        st.download_button(label="Download Merged CSV", data=csv, file_name="merged_data.csv", mime="text/csv")
                         st.write("### Dataset Statistics")
                         st.write(f"Total rows: {len(merged_df)}")
                         st.write(f"Total columns: {len(merged_df.columns)}")