Spaces:

rairo
/

OneExcelZimraAI

Build error

App Files Files Community

rairo committed on Feb 20, 2025

Commit

3bfe933

verified ·

1 Parent(s): 9cc35d1

Update app.py

Browse files

Files changed (1) hide show

app.py +136 -118

app.py CHANGED Viewed

@@ -1,4 +1,3 @@
 import streamlit as st
 import pandas as pd
 import numpy as np
@@ -14,119 +13,123 @@ genai.configure(api_key=os.environ["GOOGLE_API_KEY"])
 model = genai.GenerativeModel('gemini-2.0-flash-thinking-exp')
 def clean_column_name(col_name):
-    """Clean column names to be compatible with Arrow"""
-    cleaned = re.sub(r"[^\w\s]", " ", str(col_name).lower())
-    cleaned = re.sub(r"\s+", "_", cleaned.strip())
-    # Preserve currency indicators
-    if "usd" in cleaned:
-        return cleaned.replace("usd", "_usd")
-    if "zw" in cleaned:
-        return cleaned.replace("zw", "_zw")
-    return cleaned
 def clean_tin_value(val):
-    """Clean TIN while preserving format"""
-    val_str = str(val).strip().upper()
-    # Remove trailing .0 but keep hyphens and letters
-    val_str = re.sub(r"\.0$", "", val_str)
-    return re.sub(r"[^\w-]", "", val_str)
 def standardize_dataframe(df: pd.DataFrame) -> pd.DataFrame:
-    """Enhanced standardization for multi-currency support"""
     rename_map = {}
-    currency_keywords = {
-        'salary': ['salary', 'wage', 'earning'],
-        'overtime': ['overtime'],
-        'bonus': ['bonus'],
-        'gratuity': ['gratuity'],
-        'housing': ['housing'],
-        'vehicle': ['vehicle'],
-        'pension': ['pension'],
-        'nssa': ['nssa']
-    }
     for col in df.columns:
         col_lower = col.lower()
-        # Handle TIN first
-        if any(kw in col_lower for kw in ['tin', 'personal_id', 'tax_id']):
             rename_map[col] = 'tin'
-            continue
-        # Handle currency columns
-        found = False
-        for base_name, keywords in currency_keywords.items():
-            if any(kw in col_lower for kw in keywords):
-                currency = '_usd' if 'usd' in col_lower else '_zwl' if any(kw in col_lower for kw in ['zw', 'zwl', 'zwg']) else ''
-                new_name = f"{base_name}{currency}"
-                rename_map[col] = new_name
-                found = True
-                break
-        if not found:
-            if 'name' in col_lower:
-                rename_map[col] = 'employee_name'
-    # Apply renaming and handle duplicates
-    df = df.rename(columns=rename_map)
-    # Merge similar columns
-    for base in currency_keywords.keys():
-        cols = [c for c in df.columns if c.startswith(base)]
-        if len(cols) > 1:
-            df[base] = df[cols].bfill(axis=1).iloc[:, 0]
-            df = df.drop(columns=cols)
-    # Create employee_name if split
-    if 'employee_name' not in df.columns and {'first_name', 'last_name'}.issubset(df.columns):
-        df['employee_name'] = df['first_name'] + ' ' + df['last_name']
-    # Clean TIN column
     if 'tin' in df.columns:
-        df['tin'] = df['tin'].apply(clean_tin_value).str.strip()
     return df
 def analyze_columns(df: pd.DataFrame, filename: str) -> dict:
-    """Improved analysis prompt for financial data"""
     try:
-        sample_data = df.head(3).to_dict()
-        prompt = f"""Analyze this payroll data from {filename}. Focus on currency columns (USD/ZWL) and employee identifiers.
-        Return JSON with columns, key fields, and merging suggestions. Sample: {sample_data}"""
         response = model.generate_content(prompt)
-        return json.loads(response.text.replace('```json', '').replace('```', ''))
-    except:
-        return {"key_columns": ["tin", "employee_name"]}
-def merge_with_master(processed_files):
-    """Enhanced merging with fuzzy matching"""
-    master_df = next((f["df"] for f in processed_files if "paye" in f["filename"].lower()), None)
-    if not master_df:
-        master_df = processed_files[0]["df"]
-    for other in processed_files:
-        if other["df"] is master_df: continue
-        # Fuzzy match on TIN and names
-        other_df = other["df"]
-        merge_keys = []
-        if 'tin' in master_df and 'tin' in other_df:
-            master_df['clean_tin'] = master_df['tin'].apply(clean_tin_value)
-            other_df['clean_tin'] = other_df['tin'].apply(clean_tin_value)
-            merge_keys.append('clean_tin')
-        if 'employee_name' in both:
-            master_df['clean_name'] = master_df['employee_name'].str.lower().str.strip()
-            other_df['clean_name'] = other_df['employee_name'].str.lower().str.strip()
-            merge_keys.append('clean_name')
-        if merge_keys:
-            master_df = pd.merge(master_df, other_df, on=merge_keys, how='left', suffixes=('', '_drop'))
-            master_df = master_df.loc[:, ~master_df.columns.str.endswith('_drop')]
-    return master_df
 def read_excel_file(file) -> pd.DataFrame:
-    """Read Excel file with improved error handling"""
     try:
         return pd.read_excel(file, engine="openpyxl")
     except Exception as e1:
@@ -136,63 +139,83 @@ def read_excel_file(file) -> pd.DataFrame:
             st.error(f"Failed to read Excel file: {str(e2)}")
             return None
 def safe_display_df(df: pd.DataFrame) -> pd.DataFrame:
-    """Ensure DataFrame is safe for display in Streamlit"""
     return df.astype(str).replace({"nan": "", "None": ""})
 def main():
     st.title("Smart CSV Processor")
     st.write("Upload CSV or Excel files for intelligent analysis and merging.")
     uploaded_files = st.file_uploader(
         "Choose files", accept_multiple_files=True, type=["csv", "xlsx", "xls"]
     )
     if uploaded_files:
         st.write("### Processing Files")
         processed_files = []
         for uploaded_file in uploaded_files:
             st.write(f"#### Analyzing: {uploaded_file.name}")
             try:
                 if uploaded_file.name.endswith((".xlsx", ".xls")):
                     df = read_excel_file(uploaded_file)
                 else:
                     df = pd.read_csv(uploaded_file)
                 if df is not None:
                     df.columns = [clean_column_name(col) for col in df.columns]
                     df = standardize_dataframe(df)
                     st.write("Initial Preview:")
                     st.dataframe(df.head())
                     with st.spinner("Analyzing columns..."):
                         analysis = analyze_columns(df, uploaded_file.name)
                     if analysis:
                         st.write("Column Analysis:")
                         st.json(analysis)
                         processed_files.append(
                             {"filename": uploaded_file.name, "df": df, "analysis": analysis}
                         )
             except Exception as e:
                 st.error(f"Error processing {uploaded_file.name}: {str(e)}")
                 continue
         if len(processed_files) > 1:
             st.write("### Merging DataFrames with Earnings Schedule as Master")
             merged_df = merge_with_master(processed_files)
             if merged_df is not None:
                 st.write("### Preview of Merged Data")
                 st.dataframe(safe_display_df(merged_df.head()))
                 try:
                     csv = merged_df.to_csv(index=False)
                     st.download_button(
@@ -201,11 +224,9 @@ def main():
                         file_name="merged_data.csv",
                         mime="text/csv",
                     )
                     st.write("### Dataset Statistics")
                     st.write(f"Total rows: {len(merged_df)}")
                     st.write(f"Total columns: {len(merged_df.columns)}")
                     st.write("### Data Quality Metrics")
                     missing_df = pd.DataFrame(
                         {
@@ -215,15 +236,12 @@ def main():
                         }
                     )
                     st.dataframe(missing_df)
                     duplicates = merged_df.duplicated().sum()
                     st.write(f"Number of duplicate rows: {duplicates}")
                 except Exception as e:
                     st.error(f"Error preparing download: {str(e)}")
         else:
             st.warning("Please upload at least 2 files to merge.")
 if __name__ == "__main__":
     main()

 import streamlit as st
 import pandas as pd
 import numpy as np
 model = genai.GenerativeModel('gemini-2.0-flash-thinking-exp')
 def clean_column_name(col_name):
     """Normalise a column label into a lowercase, underscore-separated name.

     Every character that is neither a word character nor whitespace is
     replaced by a space, surrounding whitespace is stripped, the text is
     lower-cased and each run of whitespace becomes a single underscore.

     Non-string labels (ints, floats, ...) are converted with ``str()`` and
     then cleaned like any other label, so a float header ``1.5`` becomes
     ``"1_5"`` instead of keeping its dot.

     Args:
         col_name: The raw column label, of any type.

     Returns:
         str: The cleaned column name.
     """
     text = col_name if isinstance(col_name, str) else str(col_name)
     # Punctuation becomes a space (not nothing) so adjacent words stay
     # separated: "Personal ID (of employee)" -> "personal_id_of_employee".
     spaced = re.sub(r"[^\w\s]", " ", text)
     return re.sub(r"\s+", "_", spaced.strip().lower())
 def clean_tin_value(val):
     """Return *val* as a stripped string, without a trailing ``.0``.

     Numeric identifiers read from spreadsheets often arrive as floats
     (``2000123456.0``); this turns them back into plain integer strings.
     Values that do not end in ``.0`` are returned stripped but otherwise
     unchanged.

     Args:
         val: The raw cell value, of any type.

     Returns:
         str: The cleaned identifier.
     """
     val_str = str(val).strip()
     if not val_str.endswith('.0'):
         return val_str
     try:
         # Parse the integer part directly: a round-trip through float()
         # silently corrupts identifiers longer than about 15 digits.
         return str(int(val_str[:-2]))
     except ValueError:
         pass
     try:
         # Fallback for the few forms float() accepts but int() does not
         # (for example ".0").
         return str(int(float(val_str)))
     except (ValueError, OverflowError):
         return val_str
 def _has_word(name, word):
     """Return True when *word* occurs in *name* as a whole alphanumeric token."""
     tokens = "".join(ch if ch.isalnum() else " " for ch in name).split()
     return word in tokens
 def standardize_dataframe(df: pd.DataFrame) -> pd.DataFrame:
     """Standardize DataFrame column names and key column data types.

     Steps, in order:

     1. Rename identifier synonyms to ``'tin'`` and pay-related columns to
        ``'salary'``. ``'tin'`` must appear as a whole token
        (``employee_tin``, ``tin_no``), so unrelated names that merely
        contain the letters (``starting_date``, ``accounting_code``) are
        left alone.
     2. Collapse several ``'salary'`` (or ``'tin'``) columns into one, taking
        the first non-null value per row.
     3. Build ``'employee_name'`` from ``first_name`` and ``last_name`` when
        it is missing and both parts exist.
     4. Coerce ``'salary'`` to numeric (unparseable values become NaN) and
        force ``'tin'`` and ``'employee_name'`` to stripped strings.

     Args:
         df: The frame to standardize.

     Returns:
         pd.DataFrame: The standardized frame. It may be the same object as
         *df* (modified in place) when no renaming or de-duplication occurs.
     """
     tin_aliases = ('personal id', 'personal_id', 'tax id', 'taxid')
     salary_keywords = ('salary', 'wage', 'earning', 'commission', 'fee', 'payment', 'compensation')
     rename_map = {}
     for col in df.columns:
         # str() so numeric or other non-string labels do not raise here.
         col_lower = str(col).lower()
         if col_lower in tin_aliases or "personal_id_of_employee" in col_lower:
             rename_map[col] = 'tin'
         elif _has_word(col_lower, 'tin'):
             rename_map[col] = 'tin'
         # Checked independently of the branches above, so a salary keyword
         # wins when a column matches both rules.
         if any(keyword in col_lower for keyword in salary_keywords):
             rename_map[col] = 'salary'
     if rename_map:
         df = df.rename(columns=rename_map)
     for key in ('salary', 'tin'):
         duplicate_mask = df.columns == key
         if duplicate_mask.sum() > 1:
             # First non-null value across the same-named columns, per row.
             combined = df.loc[:, duplicate_mask].bfill(axis=1).iloc[:, 0]
             # Keeps the first occurrence of every duplicated label.
             df = df.loc[:, ~df.columns.duplicated()].copy()
             df[key] = combined
     if 'employee_name' not in df.columns and 'first_name' in df.columns and 'last_name' in df.columns:
         df['employee_name'] = df['first_name'].astype(str).str.strip() + ' ' + df['last_name'].astype(str).str.strip()
     if 'salary' in df.columns:
         df['salary'] = pd.to_numeric(df['salary'], errors='coerce')
     if 'tin' in df.columns:
         df['tin'] = df['tin'].fillna('').astype(str).apply(clean_tin_value)
     if 'employee_name' in df.columns:
         df['employee_name'] = df['employee_name'].fillna('').astype(str).str.strip()
     return df
 def _strip_code_fence(text):
     """Remove one surrounding Markdown code fence (optionally tagged json) from *text*."""
     text = text.strip()
     if not text.startswith("```"):
         return text
     text = text[3:]
     if text[:4].lower() == "json":
         text = text[4:]
     # Only drop a closing fence that is really there; slicing blindly would
     # cut the last characters of the JSON when the reply is not closed.
     text = text.rstrip()
     if text.endswith("```"):
         text = text[:-3]
     return text.strip()
 def _failed_analysis(subject, issue):
     """Build the placeholder analysis returned when no real analysis is available."""
     return {
         "subject": subject,
         "columns": [],
         "key_columns": [],
         "issues": [issue],
         "suggested_renames": {},
     }
 def analyze_columns(df: pd.DataFrame, filename: str) -> dict:
     """Ask the Gemini model to describe the columns of *df*.

     The first five rows are rendered as CSV text and embedded in a prompt
     that requests a JSON object with the keys ``subject``, ``columns``,
     ``key_columns``, ``issues`` and ``suggested_renames``.

     Args:
         df: The frame to analyze.
         filename: Name of the source file, included in the prompt.

     Returns:
         dict: The parsed analysis. If the request fails or the reply is not
         a JSON object, an error is shown through Streamlit and a placeholder
         dict with the same keys is returned, so callers can always treat
         the result as a dict.
     """
     try:
         sample_csv = df.head(5).astype(str).to_csv(index=False)
         prompt = f"""
         Analyze this CSV data and provide analysis in JSON format.
         Filename: {filename}
         Sample data:
         {sample_csv}
         Respond with only a valid JSON object in this format:
         {{
             "subject": "Employee payroll data",
             "columns": [
                 {{
                     "name": "column_name",
                     "type": "string/number/date",
                     "description": "Brief description"
                 }}
             ],
             "key_columns": ["employee_id", "tin"],
             "issues": ["Missing values in salary column"],
             "suggested_renames": {{
                 "old_name": "new_name"
             }}
         }}
         """
         response = model.generate_content(prompt)
         response_text = _strip_code_fence(response.text)
         try:
             analysis = json.loads(response_text)
         except json.JSONDecodeError as je:
             st.error(f"JSON parsing error: {str(je)}")
             st.text("Raw response:")
             st.text(response_text)
             return _failed_analysis("Error parsing analysis", "Error analyzing columns")
         if not isinstance(analysis, dict):
             # Valid JSON but not an object (e.g. a list): callers index the
             # result by key, so treat it like a parsing failure.
             st.error("JSON parsing error: response is not a JSON object")
             st.text("Raw response:")
             st.text(response_text)
             return _failed_analysis("Error parsing analysis", "Error analyzing columns")
         return analysis
     except Exception as e:
         # Boundary around the remote model call: report the failure in the
         # UI and fall back to a placeholder instead of aborting the upload.
         st.error(f"Error in column analysis: {str(e)}")
         return _failed_analysis("Error in analysis", str(e))
 def read_excel_file(file) -> pd.DataFrame:
+    """Read Excel file with improved error handling."""
     try:
         return pd.read_excel(file, engine="openpyxl")
     except Exception as e1:
             st.error(f"Failed to read Excel file: {str(e2)}")
             return None
+def merge_with_master(processed_files):
+    """
+    Merge multiple DataFrames using the earnings schedule file as the master.
+    The master file is identified by having 'earnings' in its filename (case insensitive).
+    Other files are merged onto the master using key columns identified by AI analysis.
+    """
+    master_file = None
+    other_files = []
+    for file_info in processed_files:
+        if "earnings" in file_info["filename"].lower():
+            master_file = file_info
+        else:
+            other_files.append(file_info)
+    if not master_file:
+        st.warning("No master file with 'earnings' found. Using the first file as master.")
+        master_file = processed_files[0]
+        other_files = processed_files[1:]
+    master_df = master_file["df"]
+    master_keys = master_file["analysis"].get("key_columns", [])
+    st.write(f"Using '{master_file['filename']}' as master with key columns: {master_keys}")
+    merged_df = master_df
+    for other in other_files:
+        other_df = other["df"]
+        other_keys = other["analysis"].get("key_columns", [])
+        common_keys = list(set(master_keys).intersection(set(other_keys)))
+        if common_keys:
+            st.write(f"Merging '{other['filename']}' on keys: {common_keys}")
+            merged_df = merged_df.merge(other_df, on=common_keys, how="left")
+        else:
+            st.warning(f"No common keys found for merging '{other['filename']}'. Skipping this file.")
+    return merged_df
 def safe_display_df(df: pd.DataFrame) -> pd.DataFrame:
     """Return an all-string copy of *df* with missing values shown as ''.

     Every cell is converted to text. Cells that were missing in *df*
     (NaN, None, NaT, pd.NA) become empty strings, as do cells whose text is
     literally ``"nan"`` or ``"None"``.

     Args:
         df: The frame to prepare for display.

     Returns:
         pd.DataFrame: A new frame of strings; *df* is not modified.
     """
     as_text = df.astype(str)
     # Blank by the original missing-value mask rather than by matching
     # rendered text, so NaT / <NA> are covered and the result does not
     # depend on how astype(str) spells a missing value. A plain array is
     # passed so no label alignment is attempted.
     as_text = as_text.where(df.notna().to_numpy(), "")
     return as_text.replace({"nan": "", "None": ""})
 def main():
     st.title("Smart CSV Processor")
     st.write("Upload CSV or Excel files for intelligent analysis and merging.")
     uploaded_files = st.file_uploader(
         "Choose files", accept_multiple_files=True, type=["csv", "xlsx", "xls"]
     )
     if uploaded_files:
         st.write("### Processing Files")
         processed_files = []
         for uploaded_file in uploaded_files:
             st.write(f"#### Analyzing: {uploaded_file.name}")
             try:
                 if uploaded_file.name.endswith((".xlsx", ".xls")):
                     df = read_excel_file(uploaded_file)
                 else:
                     df = pd.read_csv(uploaded_file)
                 if df is not None:
                     df.columns = [clean_column_name(col) for col in df.columns]
                     df = standardize_dataframe(df)
                     st.write("Initial Preview:")
                     st.dataframe(df.head())
                     with st.spinner("Analyzing columns..."):
                         analysis = analyze_columns(df, uploaded_file.name)
                     if analysis:
                         st.write("Column Analysis:")
                         st.json(analysis)
+                        # Apply suggested renames
+                        if 'suggested_renames' in analysis:
+                            df = df.rename(columns=analysis['suggested_renames'])
                         processed_files.append(
                             {"filename": uploaded_file.name, "df": df, "analysis": analysis}
                         )
             except Exception as e:
                 st.error(f"Error processing {uploaded_file.name}: {str(e)}")
                 continue
         if len(processed_files) > 1:
             st.write("### Merging DataFrames with Earnings Schedule as Master")
             merged_df = merge_with_master(processed_files)
             if merged_df is not None:
                 st.write("### Preview of Merged Data")
                 st.dataframe(safe_display_df(merged_df.head()))
                 try:
                     csv = merged_df.to_csv(index=False)
                     st.download_button(
                         file_name="merged_data.csv",
                         mime="text/csv",
                     )
                     st.write("### Dataset Statistics")
                     st.write(f"Total rows: {len(merged_df)}")
                     st.write(f"Total columns: {len(merged_df.columns)}")
                     st.write("### Data Quality Metrics")
                     missing_df = pd.DataFrame(
                         {
                         }
                     )
                     st.dataframe(missing_df)
                     duplicates = merged_df.duplicated().sum()
                     st.write(f"Number of duplicate rows: {duplicates}")
                 except Exception as e:
                     st.error(f"Error preparing download: {str(e)}")
         else:
             st.warning("Please upload at least 2 files to merge.")
 if __name__ == "__main__":
     main()