Spaces:

rairo
/

OneExcelZimraAI

Build error

App Files Files Community

rairo committed on Feb 20, 2025

Commit

364e421

verified ·

1 Parent(s): 68a00f6

Update app.py

Browse files

Files changed (1) hide show

app.py +123 -41

app.py CHANGED Viewed

@@ -13,56 +13,69 @@ import re
 genai.configure(api_key=os.environ["GOOGLE_API_KEY"])
 model = genai.GenerativeModel('gemini-2.0-flash-thinking-exp')
 def clean_column_name(col_name):
     """Clean column names to be compatible with Arrow"""
     if not isinstance(col_name, str):
         return str(col_name)
     # Remove special characters and extra spaces
-    cleaned = re.sub(r'[^\w\s]', ' ', col_name)
-    return re.sub(r'\s+', '_', cleaned.strip().lower())
 def is_salary_column(column_name: str) -> bool:
     """Check if column name suggests it contains salary/monetary data"""
-    salary_keywords = ['salary', 'wage', 'income', 'earning', 'commission', 'fee', 'payment', 'compensation']
     column_lower = column_name.lower()
     return any(keyword in column_lower for keyword in salary_keywords)
 def clean_monetary_value(value):
     """Clean monetary values by removing currency symbols and converting to float"""
     if pd.isna(value):
         return np.nan
     if isinstance(value, (int, float)):
         return float(value)
     value_str = str(value)
     # Remove currency symbols, commas, and other non-numeric characters except decimal points
-    cleaned = re.sub(r'[^0-9.-]', '', value_str)
     try:
         return float(cleaned)
     except (ValueError, TypeError):
         return np.nan
 def analyze_columns(df: pd.DataFrame, filename: str) -> dict:
     """Analyze DataFrame columns using Gemini AI with improved error handling"""
     try:
         # Prepare sample data for analysis
         display_df = df.head(5).copy()
         # Convert all columns to string for display
         for col in display_df.columns:
             display_df[col] = display_df[col].astype(str)
         sample_csv = display_df.to_csv(index=False)
         # Create a more structured prompt
         prompt = f"""
         Analyze this CSV data and provide analysis in JSON format.
         Filename: {filename}
         Sample data:
         {sample_csv}
         Respond with only a valid JSON object in this format:
         {{
             "subject": "Employee payroll data",
@@ -80,18 +93,18 @@ def analyze_columns(df: pd.DataFrame, filename: str) -> dict:
             }}
         }}
         """
         response = model.generate_content(prompt)
         response_text = response.text.strip()
         # Handle potential markdown code block
-        if response_text.startswith('```json'):
             response_text = response_text[7:-3]  # Remove ```json and ```
-        elif response_text.startswith('```'):
             response_text = response_text[3:-3]  # Remove ``` and ```
         response_text = response_text.strip()
         try:
             analysis = json.loads(response_text)
             return analysis
@@ -104,9 +117,9 @@ def analyze_columns(df: pd.DataFrame, filename: str) -> dict:
                 "columns": [],
                 "key_columns": [],
                 "issues": ["Error analyzing columns"],
-                "suggested_renames": {}
             }
     except Exception as e:
         st.error(f"Error in column analysis: {str(e)}")
         return {
@@ -114,73 +127,142 @@ def analyze_columns(df: pd.DataFrame, filename: str) -> dict:
             "columns": [],
             "key_columns": [],
             "issues": [str(e)],
-            "suggested_renames": {}
         }
 def read_excel_file(file) -> pd.DataFrame:
     """Read Excel file with improved error handling"""
     try:
         # Try reading with default engine
-        return pd.read_excel(file, engine='openpyxl')
     except Exception as e1:
         try:
             # Fallback to xlrd engine for older Excel files
-            return pd.read_excel(file, engine='xlrd')
         except Exception as e2:
             st.error(f"Failed to read Excel file: {str(e2)}")
             return None
 def main():
     st.title("Smart CSV Processor")
     st.write("Upload CSV or Excel files for intelligent analysis and merging.")
     uploaded_files = st.file_uploader(
-        "Choose files",
-        accept_multiple_files=True,
-        type=['csv', 'xlsx', 'xls']
     )
     if uploaded_files:
         st.write("### Processing Files")
         processed_files = []
         for uploaded_file in uploaded_files:
             st.write(f"#### Analyzing: {uploaded_file.name}")
             try:
                 # Read the file
-                if uploaded_file.name.endswith(('.xlsx', '.xls')):
                     df = read_excel_file(uploaded_file)
                 else:
                     df = pd.read_csv(uploaded_file)
                 if df is not None:
                     # Clean column names
                     df.columns = [clean_column_name(col) for col in df.columns]
                     # Show initial data preview
                     st.write("Initial Preview:")
                     st.dataframe(df.head())
                     # Analyze columns with improved error handling
                     with st.spinner("Analyzing columns..."):
                         analysis = analyze_columns(df, uploaded_file.name)
                     if analysis:
                         st.write("Column Analysis:")
                         st.json(analysis)
-                        processed_files.append({
-                            'filename': uploaded_file.name,
-                            'df': df,
-                            'analysis': analysis
-                        })
             except Exception as e:
                 st.error(f"Error processing {uploaded_file.name}: {str(e)}")
                 continue
-        # Rest of the merging logic remains the same...
 if __name__ == "__main__":
     main()

 # Configure the genai client when the module is loaded. os.environ[...] raises
 # KeyError if GOOGLE_API_KEY is not set, so the app fails fast without a key.
 genai.configure(api_key=os.environ["GOOGLE_API_KEY"])
 # Module-level model handle; analyze_columns() calls model.generate_content().
 model = genai.GenerativeModel('gemini-2.0-flash-thinking-exp')
 def clean_column_name(col_name):
     """Normalise a column label into a lowercase, underscore-separated name.

     String labels have every character that is neither a word character nor
     whitespace replaced by a space, are stripped and lowercased, and then
     have each run of whitespace collapsed into a single underscore.
     Non-string labels are returned as ``str(col_name)`` without cleaning.
     """
     if isinstance(col_name, str):
         # Punctuation becomes a space so "net-pay" and "net pay" agree.
         without_symbols = re.sub(r"[^\w\s]", " ", col_name)
         normalised = without_symbols.strip().lower()
         return re.sub(r"\s+", "_", normalised)
     return str(col_name)
 def is_salary_column(column_name: str) -> bool:
     """Return True when the column name contains a salary-related keyword.

     The check is a case-insensitive substring match, so a keyword embedded
     in a longer word also counts as a hit.
     """
     lowered = column_name.lower()
     for marker in (
         "salary",
         "wage",
         "income",
         "earning",
         "commission",
         "fee",
         "payment",
         "compensation",
     ):
         if marker in lowered:
             return True
     return False
 def clean_monetary_value(value):
     """Convert a monetary cell to ``float``, returning NaN when it cannot be parsed.

     Missing values map to NaN and ints/floats are returned as ``float``.
     Anything else is stringified and stripped of every character except
     digits, ``.`` and ``-`` before conversion. A value wrapped in
     parentheses, e.g. ``"(1,234.50)"``, is treated as the accounting
     notation for a negative amount.
     """
     if pd.isna(value):
         return np.nan
     if isinstance(value, (int, float)):
         return float(value)
     value_str = str(value).strip()
     # Detect accounting-style negatives before the parentheses are stripped;
     # otherwise "(500)" would silently become +500.
     is_parenthesised = value_str.startswith("(") and value_str.endswith(")")
     # Remove currency symbols, commas, and other non-numeric characters except decimal points
     cleaned = re.sub(r"[^0-9.-]", "", value_str)
     try:
         amount = float(cleaned)
     except (ValueError, TypeError):
         return np.nan
     if is_parenthesised and amount > 0:
         return -amount
     return amount
 def analyze_columns(df: pd.DataFrame, filename: str) -> dict:
     """Analyze DataFrame columns using Gemini AI with improved error handling"""
     try:
         # Prepare sample data for analysis
         display_df = df.head(5).copy()
         # Convert all columns to string for display
         for col in display_df.columns:
             display_df[col] = display_df[col].astype(str)
         sample_csv = display_df.to_csv(index=False)
         # Create a more structured prompt
         prompt = f"""
         Analyze this CSV data and provide analysis in JSON format.
         Filename: {filename}
         Sample data:
         {sample_csv}
         Respond with only a valid JSON object in this format:
         {{
             "subject": "Employee payroll data",
             }}
         }}
         """
         response = model.generate_content(prompt)
         response_text = response.text.strip()
         # Handle potential markdown code block
+        if response_text.startswith("```json"):
             response_text = response_text[7:-3]  # Remove ```json and ```
+        elif response_text.startswith("```"):
             response_text = response_text[3:-3]  # Remove ``` and ```
         response_text = response_text.strip()
         try:
             analysis = json.loads(response_text)
             return analysis
                 "columns": [],
                 "key_columns": [],
                 "issues": ["Error analyzing columns"],
+                "suggested_renames": {},
             }
     except Exception as e:
         st.error(f"Error in column analysis: {str(e)}")
         return {
             "columns": [],
             "key_columns": [],
             "issues": [str(e)],
+            "suggested_renames": {},
         }
 def read_excel_file(file) -> "pd.DataFrame | None":
     """Read an Excel file, trying ``openpyxl`` first and ``xlrd`` as a fallback.

     Args:
         file: Path or file-like object accepted by ``pd.read_excel``.

     Returns:
         The parsed DataFrame, or ``None`` when both engines fail. On failure
         a Streamlit error listing each engine's error is shown.
     """
     failures = []
     for engine in ("openpyxl", "xlrd"):
         try:
             # A failed attempt may leave the stream mid-file; rewind so each
             # engine starts from the first byte.
             if hasattr(file, "seek"):
                 file.seek(0)
             return pd.read_excel(file, engine=engine)
         except Exception as exc:
             # Engines raise unrelated exception types (bad zip, missing
             # optional dependency, ...), so keep this broad but record every
             # failure instead of discarding the first one.
             failures.append(f"{engine}: {exc}")
     st.error(f"Failed to read Excel file: {'; '.join(failures)}")
     return None
+def merge_dataframes(processed_files, selected_columns):
+    """Merge multiple DataFrames on selected columns"""
+    merged_df = processed_files[0]["df"]
+    for df_info in processed_files[1:]:
+        merged_df = merged_df.merge(df_info["df"], on=selected_columns, how="outer")
+    return merged_df
+def safe_display_df(df: pd.DataFrame) -> pd.DataFrame:
+    """Ensure DataFrame is safe for display in Streamlit"""
+    return df.astype(str).replace({"nan": "", "None": ""})
 def main():
     """Render the Streamlit page: upload, analyse, merge and export files.

     Each uploaded CSV/Excel file is read, its column names are cleaned, a
     preview is shown and the columns are analysed. When more than one file
     was processed, the user picks merge keys from the columns common to all
     files; the outer-merged result is previewed, offered as a CSV download
     and summarised with row/column counts, missing-value metrics and a
     duplicate-row count.
     """
     st.title("Smart CSV Processor")
     st.write("Upload CSV or Excel files for intelligent analysis and merging.")
     uploaded_files = st.file_uploader(
         "Choose files", accept_multiple_files=True, type=["csv", "xlsx", "xls"]
     )
     if uploaded_files:
         st.write("### Processing Files")
         processed_files = []
         for uploaded_file in uploaded_files:
             st.write(f"#### Analyzing: {uploaded_file.name}")
             try:
                 # Read the file. Compare the extension case-insensitively so
                 # names like "REPORT.XLSX" are not sent to the CSV parser.
                 if uploaded_file.name.lower().endswith((".xlsx", ".xls")):
                     df = read_excel_file(uploaded_file)
                 else:
                     df = pd.read_csv(uploaded_file)
                 if df is not None:
                     # Clean column names
                     df.columns = [clean_column_name(col) for col in df.columns]
                     # Show initial data preview
                     st.write("Initial Preview:")
                     st.dataframe(df.head())
                     # Analyze columns with improved error handling
                     with st.spinner("Analyzing columns..."):
                         analysis = analyze_columns(df, uploaded_file.name)
                     if analysis:
                         st.write("Column Analysis:")
                         st.json(analysis)
                         processed_files.append(
                             {"filename": uploaded_file.name, "df": df, "analysis": analysis}
                         )
             except Exception as e:
                 # One unreadable file must not stop the remaining uploads.
                 st.error(f"Error processing {uploaded_file.name}: {str(e)}")
         if len(processed_files) > 1:
             st.write("### Merging DataFrames")
             # Find common columns
             common_columns = list(
                 set.intersection(*[set(df_info["df"].columns) for df_info in processed_files])
             )
             if common_columns:
                 st.write("Common columns found:", common_columns)
                 selected_columns = st.multiselect(
                     "Select columns to use for merging", options=common_columns, default=common_columns
                 )
                 if selected_columns:
                     merged_df = None
                     with st.spinner("Merging datasets..."):
                         try:
                             merged_df = merge_dataframes(processed_files, selected_columns)
                         except Exception as e:
                             # e.g. key columns with incompatible dtypes across
                             # files; report it instead of crashing the page.
                             st.error(f"Error merging datasets: {str(e)}")
                     if merged_df is not None:
                         st.write("### Preview of Merged Data")
                         st.dataframe(safe_display_df(merged_df.head()))
                         # Create downloadable CSV
                         try:
                             csv = merged_df.to_csv(index=False)
                             st.download_button(
                                 label="Download Merged CSV",
                                 data=csv,
                                 file_name="merged_data.csv",
                                 mime="text/csv",
                             )
                             # Show statistics
                             st.write("### Dataset Statistics")
                             st.write(f"Total rows: {len(merged_df)}")
                             st.write(f"Total columns: {len(merged_df.columns)}")
                             # Data quality metrics
                             st.write("### Data Quality Metrics")
                             missing_counts = merged_df.isnull().sum().values
                             missing_df = pd.DataFrame(
                                 {
                                     "Column": merged_df.columns,
                                     "Missing Values": missing_counts,
                                     "Missing Percentage": (missing_counts / len(merged_df) * 100).round(2),
                                 }
                             )
                             st.dataframe(missing_df)
                             duplicates = merged_df.duplicated().sum()
                             st.write(f"Number of duplicate rows: {duplicates}")
                         except Exception as e:
                             st.error(f"Error preparing download: {str(e)}")
             else:
                 st.warning("No common columns found across datasets.")
         else:
             st.warning("Please upload at least 2 files to merge.")
 # Call main() only when this module is run as a script, not when imported.
 if __name__ == "__main__":
     main()