Spaces:

rairo
/

OneExcelZimraAI

Build error

App Files Community

rairo committed on Feb 24, 2025

Commit

fc7b3ea

verified ·

1 Parent(s): 9c17313

Update app.py

Browse files

Files changed (1) hide show

app.py +53 -28

app.py CHANGED Viewed

@@ -13,7 +13,10 @@ genai.configure(api_key=os.environ["GOOGLE_API_KEY"])
 model = genai.GenerativeModel('gemini-2.0-flash-thinking-exp')
 def clean_column_name(col_name):
-    """Clean column names to be compatible with Arrow."""
     if not isinstance(col_name, str):
         return str(col_name)
     cleaned = re.sub(r"[^\w\s]", " ", col_name)
@@ -21,7 +24,8 @@ def clean_column_name(col_name):
 def clean_tin_value(val):
     """
-    Clean the TIN value by stripping whitespace and, if it ends with '.0', converting it to an integer string.
     """
     val_str = str(val).strip()
     if val_str.endswith('.0'):
@@ -33,29 +37,35 @@ def clean_tin_value(val):
 def standardize_dataframe(df: pd.DataFrame) -> pd.DataFrame:
     """
-    Standardize DataFrame column names and data types.
     - Renames synonyms to common names (e.g., 'tin', 'salary').
     - Creates an 'employee_name' column if missing but first_name and last_name exist.
-    - Combines duplicate key columns (e.g., multiple 'salary' or 'tin' columns) into one.
     - Forces the key columns 'tin' and 'employee_name' to be strings.
-    - Drops any middle name columns.
     """
     # Drop any column that appears to be a middle name
     middle_name_cols = [col for col in df.columns if 'middle_name' in col.lower()]
     if middle_name_cols:
         df = df.drop(columns=middle_name_cols)
     rename_map = {}
     for col in df.columns:
-        col_lower = col.lower()
-        if col_lower in ['personal id', 'personal_id', 'tax id', 'taxid'] or "personal_id_of_employee" in col_lower:
             rename_map[col] = 'tin'
-        elif 'tin' in col_lower:
             rename_map[col] = 'tin'
-        if any(keyword in col_lower for keyword in ['salary', 'wage', 'earning', 'commission', 'fee', 'payment', 'compensation']):
             rename_map[col] = 'salary'
     if rename_map:
         df = df.rename(columns=rename_map)
     if 'salary' in df.columns and list(df.columns).count('salary') > 1:
         salary_cols = [col for col in df.columns if col == 'salary']
         df['salary'] = df[salary_cols].bfill(axis=1).iloc[:, 0]
@@ -64,18 +74,27 @@ def standardize_dataframe(df: pd.DataFrame) -> pd.DataFrame:
         tin_cols = [col for col in df.columns if col == 'tin']
         df['tin'] = df[tin_cols].bfill(axis=1).iloc[:, 0]
         df = df.loc[:, ~df.columns.duplicated()]
     if 'employee_name' not in df.columns and 'first_name' in df.columns and 'last_name' in df.columns:
         df['employee_name'] = df['first_name'].astype(str).str.strip() + ' ' + df['last_name'].astype(str).str.strip()
     if 'salary' in df.columns:
         df['salary'] = pd.to_numeric(df['salary'], errors='coerce')
     if 'tin' in df.columns:
         df['tin'] = df['tin'].fillna('').astype(str).apply(clean_tin_value)
     if 'employee_name' in df.columns:
         df['employee_name'] = df['employee_name'].fillna('').astype(str).str.strip()
     return df
 def analyze_columns(df: pd.DataFrame, filename: str) -> dict:
-    """Analyze DataFrame columns using Gemini AI with an updated prompt."""
     try:
         display_df = df.head(5).copy()
         for col in display_df.columns:
@@ -144,7 +163,10 @@ def analyze_columns(df: pd.DataFrame, filename: str) -> dict:
         }
 def read_excel_file(file) -> pd.DataFrame:
-    """Read Excel file with improved error handling."""
     try:
         return pd.read_excel(file, engine="openpyxl")
     except Exception as e1:
@@ -158,12 +180,14 @@ def merge_with_master(processed_files):
     """
     Merge multiple DataFrames using a two-step process:
     1. Use the earnings file as the master. Drop its inaccurate 'tin' column.
-    2. Merge template info onto earnings via 'employee_name' (constructed from first and last names).
     3. Then merge the resulting DataFrame with the PAYE file using the (correct) 'tin' key.
     """
     earnings_file = None
     paye_file = None
     template_file = None
     for file_info in processed_files:
         lower_filename = file_info["filename"].lower()
         if "earnings" in lower_filename:
@@ -176,20 +200,20 @@ def merge_with_master(processed_files):
         st.warning("No earnings file found as master. Using the first file as master.")
         earnings_file = processed_files[0]
-    # Start with the earnings DataFrame as master
     earnings_df = earnings_file["df"]
     # Drop the inaccurate 'tin' column from earnings, if present
     if 'tin' in earnings_df.columns:
         earnings_df = earnings_df.drop(columns=['tin'])
-    # Ensure any middle_name column is dropped (already handled in standardization, but double-check)
     if 'middle_name' in earnings_df.columns:
         earnings_df = earnings_df.drop(columns=['middle_name'])
     merged_df = earnings_df.copy()
-    # Merge template info onto earnings using 'employee_name' as key
     if template_file is not None:
-        st.write(f"Merging template info from '{template_file['filename']}' onto the earnings sheet using key 'employee_name'.")
         template_df = template_file["df"]
         # Drop any middle_name column from the template file
         if 'middle_name' in template_df.columns:
@@ -206,7 +230,7 @@ def merge_with_master(processed_files):
     # Merge PAYE figures onto the merged DataFrame using 'tin'
     if paye_file is not None:
-        st.write(f"Merging PAYE figures from '{paye_file['filename']}' onto the merged sheet using key 'tin'.")
         paye_df = paye_file["df"]
         if 'tin' in merged_df.columns and 'tin' in paye_df.columns:
             merged_df = merged_df.merge(paye_df, on='tin', how='left', suffixes=('', '_paye'))
@@ -218,7 +242,10 @@ def merge_with_master(processed_files):
     return merged_df
 def safe_display_df(df: pd.DataFrame) -> pd.DataFrame:
-    """Ensure DataFrame is safe for display in Streamlit."""
     return df.astype(str).replace({"nan": "", "None": ""})
 def main():
@@ -239,9 +266,9 @@ def main():
                     df = pd.read_csv(uploaded_file)
                 if df is not None:
                     if df.empty:
-                        st.warning(f"DataFrame from '{uploaded_file.name}' is empty after reading. Please check the file.")
                         continue
-                    df.columns = [clean_column_name(col) for col in df.columns]
                     df = standardize_dataframe(df)
                     st.write("Initial Preview:")
                     st.dataframe(df.head())
@@ -250,7 +277,7 @@ def main():
                     if analysis:
                         st.write("Column Analysis:")
                         st.json(analysis)
-                        # Apply suggested renames if provided
                         if 'suggested_renames' in analysis:
                             df = df.rename(columns=analysis['suggested_renames'])
                         processed_files.append(
@@ -281,13 +308,11 @@ def main():
                         st.write(f"Total rows: {len(merged_df)}")
                         st.write(f"Total columns: {len(merged_df.columns)}")
                         st.write("### Data Quality Metrics")
-                        missing_df = pd.DataFrame(
-                            {
-                                "Column": merged_df.columns,
-                                "Missing Values": merged_df.isnull().sum().values,
-                                "Missing Percentage": (merged_df.isnull().sum().values / len(merged_df) * 100).round(2),
-                            }
-                        )
                         st.dataframe(missing_df)
                         duplicates = merged_df.duplicated().sum()
                         st.write(f"Number of duplicate rows: {duplicates}")

 model = genai.GenerativeModel('gemini-2.0-flash-thinking-exp')
 def clean_column_name(col_name):
+    """
+    Clean column names to be compatible with Arrow.
+    Converts to lowercase and replaces non-alphanumeric characters with underscores.
+    """
     if not isinstance(col_name, str):
         return str(col_name)
     cleaned = re.sub(r"[^\w\s]", " ", col_name)
 def clean_tin_value(val):
     """
+    Clean the TIN value by stripping whitespace and, if it ends with '.0',
+    converting it to an integer string.
     """
     val_str = str(val).strip()
     if val_str.endswith('.0'):
 def standardize_dataframe(df: pd.DataFrame) -> pd.DataFrame:
     """
+    Standardize DataFrame column names and data types:
+    - Drops any middle name columns.
+    - Cleans all column names (e.g., "Employee Name" -> "employee_name").
     - Renames synonyms to common names (e.g., 'tin', 'salary').
     - Creates an 'employee_name' column if missing but first_name and last_name exist.
+    - Combines duplicate key columns into one.
     - Forces the key columns 'tin' and 'employee_name' to be strings.
     """
     # Drop any column that appears to be a middle name
     middle_name_cols = [col for col in df.columns if 'middle_name' in col.lower()]
     if middle_name_cols:
         df = df.drop(columns=middle_name_cols)
+    # Clean all column names first so that "Employee Name" becomes "employee_name"
+    df.columns = [clean_column_name(col) for col in df.columns]
+    # Rename columns based on synonyms for TIN and salary
     rename_map = {}
     for col in df.columns:
+        if col in ['personal id', 'personal_id', 'tax id', 'taxid'] or "personal_id_of_employee" in col:
             rename_map[col] = 'tin'
+        elif 'tin' in col:
             rename_map[col] = 'tin'
+        if any(keyword in col for keyword in ['salary', 'wage', 'earning', 'commission', 'fee', 'payment', 'compensation']):
             rename_map[col] = 'salary'
     if rename_map:
         df = df.rename(columns=rename_map)
+    # Combine duplicate columns (e.g., multiple salary or tin columns)
     if 'salary' in df.columns and list(df.columns).count('salary') > 1:
         salary_cols = [col for col in df.columns if col == 'salary']
         df['salary'] = df[salary_cols].bfill(axis=1).iloc[:, 0]
         tin_cols = [col for col in df.columns if col == 'tin']
         df['tin'] = df[tin_cols].bfill(axis=1).iloc[:, 0]
         df = df.loc[:, ~df.columns.duplicated()]
+    # If employee_name is missing and first_name and last_name exist, create it.
     if 'employee_name' not in df.columns and 'first_name' in df.columns and 'last_name' in df.columns:
         df['employee_name'] = df['first_name'].astype(str).str.strip() + ' ' + df['last_name'].astype(str).str.strip()
+    # Ensure key columns are of the correct type
     if 'salary' in df.columns:
         df['salary'] = pd.to_numeric(df['salary'], errors='coerce')
     if 'tin' in df.columns:
         df['tin'] = df['tin'].fillna('').astype(str).apply(clean_tin_value)
     if 'employee_name' in df.columns:
         df['employee_name'] = df['employee_name'].fillna('').astype(str).str.strip()
     return df
 def analyze_columns(df: pd.DataFrame, filename: str) -> dict:
+    """
+    Analyze DataFrame columns using Gemini AI.
+    Returns a JSON object with details about columns, key columns for merging,
+    any data quality issues, and suggested renames.
+    """
     try:
         display_df = df.head(5).copy()
         for col in display_df.columns:
         }
 def read_excel_file(file) -> pd.DataFrame:
+    """
+    Read an Excel file with error handling.
+    Tries openpyxl first and falls back to xlrd.
+    """
     try:
         return pd.read_excel(file, engine="openpyxl")
     except Exception as e1:
     """
     Merge multiple DataFrames using a two-step process:
     1. Use the earnings file as the master. Drop its inaccurate 'tin' column.
+    2. Merge template info onto earnings via 'employee_name' (the key provided by "Employee Name").
     3. Then merge the resulting DataFrame with the PAYE file using the (correct) 'tin' key.
     """
     earnings_file = None
     paye_file = None
     template_file = None
+    # Identify files based on filename keywords
     for file_info in processed_files:
         lower_filename = file_info["filename"].lower()
         if "earnings" in lower_filename:
         st.warning("No earnings file found as master. Using the first file as master.")
         earnings_file = processed_files[0]
+    # Use the earnings DataFrame as the master
     earnings_df = earnings_file["df"]
     # Drop the inaccurate 'tin' column from earnings, if present
     if 'tin' in earnings_df.columns:
         earnings_df = earnings_df.drop(columns=['tin'])
+    # Double-check removal of any middle_name column (should already be done in standardization)
     if 'middle_name' in earnings_df.columns:
         earnings_df = earnings_df.drop(columns=['middle_name'])
     merged_df = earnings_df.copy()
+    # Merge template info onto earnings using 'employee_name'
     if template_file is not None:
+        st.write(f"Merging template info from '{template_file['filename']}' using key 'employee_name'.")
         template_df = template_file["df"]
         # Drop any middle_name column from the template file
         if 'middle_name' in template_df.columns:
     # Merge PAYE figures onto the merged DataFrame using 'tin'
     if paye_file is not None:
+        st.write(f"Merging PAYE figures from '{paye_file['filename']}' using key 'tin'.")
         paye_df = paye_file["df"]
         if 'tin' in merged_df.columns and 'tin' in paye_df.columns:
             merged_df = merged_df.merge(paye_df, on='tin', how='left', suffixes=('', '_paye'))
     return merged_df
 def safe_display_df(df: pd.DataFrame) -> pd.DataFrame:
+    """
+    Prepare DataFrame for safe display in Streamlit by converting all entries to strings
+    and replacing common null placeholders.
+    """
     return df.astype(str).replace({"nan": "", "None": ""})
 def main():
                     df = pd.read_csv(uploaded_file)
                 if df is not None:
                     if df.empty:
+                        st.warning(f"DataFrame from '{uploaded_file.name}' is empty. Please check the file.")
                         continue
+                    # Standardize column names and key columns
                     df = standardize_dataframe(df)
                     st.write("Initial Preview:")
                     st.dataframe(df.head())
                     if analysis:
                         st.write("Column Analysis:")
                         st.json(analysis)
+                        # Apply suggested renames if provided by the analysis
                         if 'suggested_renames' in analysis:
                             df = df.rename(columns=analysis['suggested_renames'])
                         processed_files.append(
                         st.write(f"Total rows: {len(merged_df)}")
                         st.write(f"Total columns: {len(merged_df.columns)}")
                         st.write("### Data Quality Metrics")
+                        missing_df = pd.DataFrame({
+                            "Column": merged_df.columns,
+                            "Missing Values": merged_df.isnull().sum().values,
+                            "Missing Percentage": (merged_df.isnull().sum().values / len(merged_df) * 100).round(2),
+                        })
                         st.dataframe(missing_df)
                         duplicates = merged_df.duplicated().sum()
                         st.write(f"Number of duplicate rows: {duplicates}")