Spaces:

rairo
/

OneExcelZimraAI

Build error

App Files Files Community

rairo committed on Feb 20, 2025

Commit

5d021f4

verified ·

1 Parent(s): d05ef69

Update app.py

Browse files

Files changed (1) hide show

app.py +92 -165

app.py CHANGED Viewed

@@ -1,3 +1,4 @@
 import streamlit as st
 import pandas as pd
 import numpy as np
@@ -12,150 +13,115 @@ import re
 genai.configure(api_key=os.environ["GOOGLE_API_KEY"])
 model = genai.GenerativeModel('gemini-2.0-flash-thinking-exp')
 def clean_column_name(col_name):
     """Clean column names to be compatible with Arrow"""
-    if not isinstance(col_name, str):
-        return str(col_name)
-    cleaned = re.sub(r"[^\w\s]", " ", col_name)
-    return re.sub(r"\s+", "_", cleaned.strip().lower())
 def clean_tin_value(val):
-    """
-    Clean the TIN value by stripping whitespace and, if it ends with '.0', converting it to an integer string.
-    """
-    val_str = str(val).strip()
-    if val_str.endswith('.0'):
-        try:
-            return str(int(float(val_str)))
-        except Exception:
-            return val_str
-    return val_str
 def standardize_dataframe(df: pd.DataFrame) -> pd.DataFrame:
-    """
-    Standardize DataFrame column names and data types.
-    - Renames synonyms to common names (e.g., 'tin', 'salary').
-    - In particular, any header containing 'personal_id_of_employee' (or similar) or 'tin' is renamed to 'tin'.
-    - Creates an 'employee_name' column if missing but first_name and last_name exist.
-    - Combines duplicate key columns (e.g., multiple 'salary' or 'tin' columns) into one.
-    - Forces the key columns 'tin' and 'employee_name' to be strings.
-    """
     rename_map = {}
     for col in df.columns:
         col_lower = col.lower()
-        # Rename headers to 'tin'
-        if col_lower in ['personal id', 'personal_id', 'tax id', 'taxid'] or "personal_id_of_employee" in col_lower:
-            rename_map[col] = 'tin'
-        elif 'tin' in col_lower:
             rename_map[col] = 'tin'
-        # Rename headers to 'salary'
-        if any(keyword in col_lower for keyword in ['salary', 'wage', 'earning', 'commission', 'fee', 'payment', 'compensation']):
-            rename_map[col] = 'salary'
-    if rename_map:
-        df = df.rename(columns=rename_map)
-    # Combine duplicate columns for 'salary'
-    if 'salary' in df.columns and list(df.columns).count('salary') > 1:
-        salary_cols = [col for col in df.columns if col == 'salary']
-        df['salary'] = df[salary_cols].bfill(axis=1).iloc[:, 0]
-        df = df.loc[:, ~df.columns.duplicated()]
-    # Combine duplicate columns for 'tin'
-    if 'tin' in df.columns and list(df.columns).count('tin') > 1:
-        tin_cols = [col for col in df.columns if col == 'tin']
-        df['tin'] = df[tin_cols].bfill(axis=1).iloc[:, 0]
-        df = df.loc[:, ~df.columns.duplicated()]
-    # Create employee_name if not present but first_name and last_name exist
-    if 'employee_name' not in df.columns and 'first_name' in df.columns and 'last_name' in df.columns:
-        df['employee_name'] = df['first_name'].astype(str).str.strip() + ' ' + df['last_name'].astype(str).str.strip()
-    # Ensure salary column is numeric (to avoid conversion errors later)
-    if 'salary' in df.columns:
-        df['salary'] = pd.to_numeric(df['salary'], errors='coerce')
-    # Force key columns to be strings, filling NaNs with empty strings
     if 'tin' in df.columns:
-        df['tin'] = df['tin'].fillna('').astype(str).apply(clean_tin_value)
-    if 'employee_name' in df.columns:
-        df['employee_name'] = df['employee_name'].fillna('').astype(str).str.strip()
     return df
 def analyze_columns(df: pd.DataFrame, filename: str) -> dict:
-    """Analyze DataFrame columns using Gemini AI with improved error handling"""
     try:
-        display_df = df.head(5).copy()
-        for col in display_df.columns:
-            display_df[col] = display_df[col].astype(str)
-        sample_csv = display_df.to_csv(index=False)
-        prompt = f"""
-        Analyze this CSV data and provide analysis in JSON format.
-        Filename: {filename}
-        Sample data:
-        {sample_csv}
-        Respond with only a valid JSON object in this format:
-        {{
-            "subject": "Employee payroll data",
-            "columns": [
-                {{
-                    "name": "column_name",
-                    "type": "string/number/date",
-                    "description": "Brief description"
-                }}
-            ],
-            "key_columns": ["employee_id", "tin"],
-            "issues": ["Missing values in salary column"],
-            "suggested_renames": {{
-                "old_name": "new_name"
-            }}
-        }}
-        """
         response = model.generate_content(prompt)
-        response_text = response.text.strip()
-        if response_text.startswith("```json"):
-            response_text = response_text[7:-3]
-        elif response_text.startswith("```"):
-            response_text = response_text[3:-3]
-        response_text = response_text.strip()
-        try:
-            analysis = json.loads(response_text)
-            return analysis
-        except json.JSONDecodeError as je:
-            st.error(f"JSON parsing error: {str(je)}")
-            st.text("Raw response:")
-            st.text(response_text)
-            return {
-                "subject": "Error parsing analysis",
-                "columns": [],
-                "key_columns": [],
-                "issues": ["Error analyzing columns"],
-                "suggested_renames": {},
-            }
-    except Exception as e:
-        st.error(f"Error in column analysis: {str(e)}")
-        return {
-            "subject": "Error in analysis",
-            "columns": [],
-            "key_columns": [],
-            "issues": [str(e)],
-            "suggested_renames": {},
-        }
 def read_excel_file(file) -> pd.DataFrame:
     """Read Excel file with improved error handling"""
@@ -169,45 +135,6 @@ def read_excel_file(file) -> pd.DataFrame:
             return None
-def merge_with_master(processed_files):
-    """
-    Merge multiple DataFrames using the earnings schedule file as the master.
-    The master file is identified by having 'earnings' in its filename (case insensitive).
-    Other files are merged onto the master using key columns (e.g., 'tin', 'employee_name').
-    """
-    master_file = None
-    other_files = []
-    for file_info in processed_files:
-        if "earnings" in file_info["filename"].lower():
-            master_file = file_info
-        else:
-            other_files.append(file_info)
-    if not master_file:
-        st.warning("No master file with 'earnings' found. Using the first file as master.")
-        master_file = processed_files[0]
-        other_files = processed_files[1:]
-    master_df = master_file["df"]
-    st.write(f"Using '{master_file['filename']}' as master for merging.")
-    default_keys = ['tin', 'employee_name']
-    merged_df = master_df
-    for other in other_files:
-        other_df = other["df"]
-        keys_to_use = [key for key in default_keys if key in other_df.columns and key in merged_df.columns]
-        if not keys_to_use:
-            keys_to_use = list(set(merged_df.columns).intersection(set(other_df.columns)))
-        if keys_to_use:
-            st.write(f"Merging '{other['filename']}' on keys: {keys_to_use}")
-            merged_df = merged_df.merge(other_df, on=keys_to_use, how="left")
-        else:
-            st.warning(f"No common keys found for merging '{other['filename']}'. Skipping this file.")
-    return merged_df
 def safe_display_df(df: pd.DataFrame) -> pd.DataFrame:
     """Ensure DataFrame is safe for display in Streamlit"""

 import streamlit as st
 import pandas as pd
 import numpy as np
 # Configure the generative-AI client (presumably google.generativeai; its import is
 # not visible here). os.environ[...] raises KeyError at module load when
 # GOOGLE_API_KEY is not set.
 genai.configure(api_key=os.environ["GOOGLE_API_KEY"])
 # Module-level model handle; analyze_columns calls model.generate_content on it.
 model = genai.GenerativeModel('gemini-2.0-flash-thinking-exp')
 def clean_column_name(col_name):
     """Normalise a column header into a lowercase snake_case label.

     The value is stringified and lowercased, punctuation is turned into
     whitespace, and each run of whitespace becomes a single underscore.
     A currency code glued to the preceding word ("salaryusd") is split off
     with an underscore so the currency stays recognisable.

     Args:
         col_name: Any header value; non-strings are converted with ``str``.

     Returns:
         The cleaned label, e.g. ``"Basic Salary (USD)"`` -> ``"basic_salary_usd"``.
     """
     lowered = str(col_name).lower()
     # Punctuation becomes a space first so "Salary(USD)" still yields two words.
     cleaned = re.sub(r"[^\w\s]", " ", lowered)
     cleaned = re.sub(r"\s+", "_", cleaned.strip())
     # Preserve currency indicators: insert "_" only when the code directly
     # follows a letter or digit, so an existing separator is never doubled
     # and a leading code does not gain a stray underscore.
     return re.sub(r"(?<=[^\W_])(usd|zw)", r"_\1", cleaned)
 def clean_tin_value(val):
     """Normalise a TIN (tax identification number) into a comparable string.

     Surrounding whitespace is removed, letters are upper-cased, a trailing
     all-zero decimal part (an artefact of numeric spreadsheet cells) is
     dropped, and every character other than word characters and hyphens is
     stripped.

     Args:
         val: Raw cell value (string, number, ``None`` or NaN).

     Returns:
         The cleaned TIN, or ``""`` when the value is missing, so that missing
         cells never turn into literal "NONE"/"NAN" identifiers.
     """
     if val is None:
         return ""
     if isinstance(val, float):
         if val != val:  # NaN is the only float that differs from itself
             return ""
         if val.is_integer():
             # Avoid exponent notation such as "1e+16" for large numeric TINs.
             val = int(val)
     val_str = str(val).strip().upper()
     # Remove a trailing ".0", ".00", ... before the dot itself is stripped,
     # otherwise "123.00" would collapse into "12300".
     val_str = re.sub(r"\.0+$", "", val_str)
     return re.sub(r"[^\w-]", "", val_str)
 def standardize_dataframe(df: pd.DataFrame) -> pd.DataFrame:
     """Rename payroll columns to canonical names while keeping currencies apart.

     Renaming rules, applied per column (first match wins):
       * a standalone "tin" token, or "personal_id" / "tax_id" / "taxid",
         becomes ``tin``;
       * a pay-component keyword (salary, wage, earning, overtime, ...) becomes
         ``<component>``, ``<component>_usd`` or ``<component>_zwl`` depending
         on the currency named in the header;
       * "first name" / "last name" variants become ``first_name`` / ``last_name``;
       * any other header containing "name" becomes ``employee_name``.

     Columns that end up with the same label are coalesced row-wise (first
     non-null value wins). Columns in different currencies are never merged.

     Args:
         df: Raw sheet contents. The caller's frame is not modified.

     Returns:
         A new DataFrame with canonical column names, an ``employee_name``
         column when first/last name parts are available, and a cleaned
         string ``tin`` column when a TIN column exists.
     """
     currency_keywords = {
         'salary': ['salary', 'wage', 'earning'],
         'overtime': ['overtime'],
         'bonus': ['bonus'],
         'gratuity': ['gratuity'],
         'housing': ['housing'],
         'vehicle': ['vehicle'],
         'pension': ['pension'],
         'nssa': ['nssa'],
     }
     name_parts = {
         'first_name': 'first_name',
         'firstname': 'first_name',
         'last_name': 'last_name',
         'lastname': 'last_name',
     }
     rename_map = {}
     for col in df.columns:
         # Headers may be ints or other non-strings (e.g. unnamed columns).
         col_lower = str(col).lower()
         normalized = re.sub(r"[^a-z0-9]+", "_", col_lower).strip("_")
         # Handle TIN first. Match "tin" only as its own token so headers such
         # as "Starting Date" or "Rating" are not mistaken for identifiers.
         is_tin = re.search(r"(?<![a-z])tin(?![a-z])", col_lower) is not None
         if is_tin or any(kw in normalized for kw in ('personal_id', 'tax_id', 'taxid')):
             rename_map[col] = 'tin'
             continue
         # Handle currency columns.
         base_name = next(
             (
                 base
                 for base, keywords in currency_keywords.items()
                 if any(kw in col_lower for kw in keywords)
             ),
             None,
         )
         if base_name is not None:
             if 'usd' in col_lower:
                 currency = '_usd'
             elif 'zw' in col_lower:  # covers zw, zwl and zwg
                 currency = '_zwl'
             else:
                 currency = ''
             rename_map[col] = f"{base_name}{currency}"
             continue
         if normalized in name_parts:
             # Keep the name parts distinct so they can be combined below.
             rename_map[col] = name_parts[normalized]
         elif 'name' in col_lower:
             rename_map[col] = 'employee_name'
     # rename() returns a new frame, so the caller's data stays untouched.
     df = df.rename(columns=rename_map)
     # Coalesce columns that now share a label. Only exact duplicates are
     # merged, which keeps e.g. salary_usd and salary_zwl separate.
     dup_mask = df.columns.duplicated()
     if dup_mask.any():
         dup_labels = list(dict.fromkeys(df.columns[dup_mask]))
         combined = {
             label: df.loc[:, df.columns == label].bfill(axis=1).iloc[:, 0]
             for label in dup_labels
         }
         df = df.loc[:, ~dup_mask].copy()
         for label, values in combined.items():
             df[label] = values
     # Create employee_name if it is split across two columns.
     if 'employee_name' not in df.columns and {'first_name', 'last_name'}.issubset(df.columns):
         first = df['first_name'].fillna('').astype(str).str.strip()
         last = df['last_name'].fillna('').astype(str).str.strip()
         df['employee_name'] = (first + ' ' + last).str.strip()
     # Clean TIN column.
     if 'tin' in df.columns:
         df['tin'] = df['tin'].apply(clean_tin_value).astype(str)
     return df
 def analyze_columns(df: pd.DataFrame, filename: str) -> dict:
     """Ask the generative model to describe a payroll sheet's columns.

     The first three rows are sent as a sample together with the filename,
     and the model's reply is parsed as JSON.

     Args:
         df: Sheet contents to sample from.
         filename: Name of the uploaded file, included in the prompt.

     Returns:
         A dict that always contains ``subject``, ``columns``, ``key_columns``,
         ``issues`` and ``suggested_renames``. Values supplied by the model
         override the defaults. When the request or the parsing fails, the
         defaults are returned with the error text recorded under ``issues``
         and ``key_columns`` set to ``["tin", "employee_name"]``.
     """
     fallback = {
         "subject": "",
         "columns": [],
         "key_columns": ["tin", "employee_name"],
         "issues": [],
         "suggested_renames": {},
     }
     try:
         sample_data = df.head(3).to_dict()
         prompt = f"""Analyze this payroll data from {filename}. Focus on currency columns (USD/ZWL) and employee identifiers.
         Return JSON with columns, key fields, and merging suggestions. Sample: {sample_data}"""
         response = model.generate_content(prompt)
         raw = response.text.replace('```json', '').replace('```', '').strip()
         # Models sometimes wrap the payload in explanatory text; keep only
         # the outermost JSON object when one is present.
         start, end = raw.find('{'), raw.rfind('}')
         if start != -1 and end > start:
             raw = raw[start:end + 1]
         analysis = json.loads(raw)
     except Exception as exc:
         # Best-effort boundary: analysis is advisory, so an API, network or
         # parsing failure must not abort processing -- but it is reported
         # instead of being swallowed silently.
         st.warning(f"Column analysis failed for {filename}: {exc}")
         return {**fallback, "issues": [str(exc)]}
     if not isinstance(analysis, dict):
         # Valid JSON of the wrong shape (e.g. a list) is treated as unusable.
         return fallback
     return {**fallback, **analysis}
+def merge_with_master(processed_files):
+    """Enhanced merging with fuzzy matching"""
+    master_df = next((f["df"] for f in processed_files if "paye" in f["filename"].lower()), None)
+    if not master_df:
+        master_df = processed_files[0]["df"]
+    for other in processed_files:
+        if other["df"] is master_df: continue
+        # Fuzzy match on TIN and names
+        other_df = other["df"]
+        merge_keys = []
+        if 'tin' in master_df and 'tin' in other_df:
+            master_df['clean_tin'] = master_df['tin'].apply(clean_tin_value)
+            other_df['clean_tin'] = other_df['tin'].apply(clean_tin_value)
+            merge_keys.append('clean_tin')
+        if 'employee_name' in both:
+            master_df['clean_name'] = master_df['employee_name'].str.lower().str.strip()
+            other_df['clean_name'] = other_df['employee_name'].str.lower().str.strip()
+            merge_keys.append('clean_name')
+        if merge_keys:
+            master_df = pd.merge(master_df, other_df, on=merge_keys, how='left', suffixes=('', '_drop'))
+            master_df = master_df.loc[:, ~master_df.columns.str.endswith('_drop')]
+    return master_df
 def read_excel_file(file) -> pd.DataFrame:
     """Read Excel file with improved error handling"""
             return None
 def safe_display_df(df: pd.DataFrame) -> pd.DataFrame:
     """Ensure DataFrame is safe for display in Streamlit"""