Spaces:

rairo
/

OneExcelZimraAI

Build error

App Files Files Community

rairo committed on Feb 20, 2025

Commit

6250aa7

verified ·

1 Parent(s): 3a17237

Update app.py

Browse files

Files changed (1) hide show

app.py +109 -41

app.py CHANGED Viewed

@@ -11,11 +11,44 @@ import tempfile
 genai.configure(api_key=os.environ["GOOGLE_API_KEY"])
 model = genai.GenerativeModel('gemini-2.0-flash-thinking-exp')
 def convert_excel_to_csv(excel_file):
     """Convert Excel file to CSV and return the DataFrame"""
     try:
         df = pd.read_excel(excel_file)
-        return df
     except Exception as e:
         st.error(f"Error converting Excel file: {str(e)}")
         return None
@@ -23,7 +56,11 @@ def convert_excel_to_csv(excel_file):
 def analyze_columns(df: pd.DataFrame, filename: str) -> dict:
     """Analyze DataFrame columns using Gemini AI"""
     # Convert sample of DataFrame to CSV string
-    sample_csv = df.head(5).to_csv(index=False)
     analysis_prompt = f"""
     Analyze this CSV data from file '{filename}' and provide the following in JSON format:
@@ -80,6 +117,14 @@ def merge_dataframes(dataframes: List[Dict], common_columns: List[str]) -> pd.Da
     # Merge with remaining DataFrames
     for df_info in dataframes[1:]:
         try:
             merged_df = pd.merge(
                 merged_df,
                 df_info['df'],
@@ -91,7 +136,20 @@ def merge_dataframes(dataframes: List[Dict], common_columns: List[str]) -> pd.Da
             st.error(f"Error merging {df_info['filename']}: {str(e)}")
             continue
-    return merged_df
 def main():
     st.title("Smart CSV Processor")
@@ -113,38 +171,42 @@ def main():
         for uploaded_file in uploaded_files:
             st.write(f"#### Analyzing: {uploaded_file.name}")
-            # Read file into DataFrame
-            if uploaded_file.name.endswith(('.xlsx', '.xls')):
-                df = convert_excel_to_csv(uploaded_file)
-            else:
-                df = pd.read_csv(uploaded_file)
-            if df is not None:
-                # Show initial data preview
-                st.write("Initial Preview:")
-                st.dataframe(df.head())
-                # Analyze columns using Gemini
-                with st.spinner("Analyzing columns with AI..."):
-                    analysis = analyze_columns(df, uploaded_file.name)
-                if analysis:
-                    st.write("Column Analysis:")
-                    st.json(analysis)
-                    # Store DataFrame and its analysis
-                    processed_files.append({
-                        'filename': uploaded_file.name,
-                        'df': df,
-                        'analysis': analysis
-                    })
-                    # Apply suggested column renames if any
-                    if 'suggested_renames' in analysis and analysis['suggested_renames']:
-                        df.rename(columns=analysis['suggested_renames'], inplace=True)
-                        st.write("Applied suggested column renames.")
-                        st.write("Updated Preview:")
-                        st.dataframe(df.head())
         if len(processed_files) > 1:
             st.write("### Merging DataFrames")
@@ -169,16 +231,19 @@ def main():
                     if merged_df is not None:
                         st.write("### Preview of Merged Data")
-                        st.dataframe(merged_df.head())
                         # Download button for merged CSV
-                        csv = merged_df.to_csv(index=False)
-                        st.download_button(
-                            label="Download Merged CSV",
-                            data=csv,
-                            file_name="merged_data.csv",
-                            mime="text/csv"
-                        )
                         # Show statistics
                         st.write("### Dataset Statistics")
@@ -189,7 +254,10 @@ def main():
                         st.write("### Data Quality Metrics")
                         missing_values = merged_df.isnull().sum()
                         st.write("Missing values per column:")
-                        st.dataframe(missing_values)
                         # Show duplicate check
                         duplicates = merged_df.duplicated().sum()

 genai.configure(api_key=os.environ["GOOGLE_API_KEY"])
 model = genai.GenerativeModel('gemini-2.0-flash-thinking-exp')
+def clean_dataframe(df: pd.DataFrame) -> pd.DataFrame:
+    """Clean DataFrame to ensure Arrow compatibility"""
+    for column in df.columns:
+        # Convert column name to string if it's not already
+        if not isinstance(column, str):
+            df.rename(columns={column: str(column)}, inplace=True)
+        # Handle mixed types in columns
+        if df[column].dtype == 'object':
+            # Try to convert to numeric, coerce errors to NaN
+            numeric_conversion = pd.to_numeric(df[column], errors='coerce')
+            if numeric_conversion.notna().any():
+                df[column] = numeric_conversion
+            else:
+                # If not numeric, ensure string type
+                df[column] = df[column].astype(str)
+        # Convert any remaining object types to string
+        if df[column].dtype == 'object':
+            df[column] = df[column].astype(str)
+        # Handle special cases for numeric columns
+        if pd.api.types.is_numeric_dtype(df[column]):
+            # Check if column contains large numbers that might cause overflow
+            if df[column].max() > 1e9 or df[column].min() < -1e9:
+                df[column] = df[column].astype('float64')
+        # Replace infinity values with NaN
+        if pd.api.types.is_numeric_dtype(df[column]):
+            df[column] = df[column].replace([np.inf, -np.inf], np.nan)
+    return df
 def convert_excel_to_csv(excel_file):
     """Convert Excel file to CSV and return the DataFrame"""
     try:
         df = pd.read_excel(excel_file)
+        return clean_dataframe(df)
     except Exception as e:
         st.error(f"Error converting Excel file: {str(e)}")
         return None
 def analyze_columns(df: pd.DataFrame, filename: str) -> dict:
     """Analyze DataFrame columns using Gemini AI"""
     # Convert sample of DataFrame to CSV string
+    sample_df = df.head(5).copy()
+    # Convert all columns to string for analysis
+    for col in sample_df.columns:
+        sample_df[col] = sample_df[col].astype(str)
+    sample_csv = sample_df.to_csv(index=False)
     analysis_prompt = f"""
     Analyze this CSV data from file '{filename}' and provide the following in JSON format:
     # Merge with remaining DataFrames
     for df_info in dataframes[1:]:
         try:
+            # Ensure common columns have the same type before merging
+            for col in common_columns:
+                if col in merged_df.columns and col in df_info['df'].columns:
+                    # Convert to string if types don't match
+                    if merged_df[col].dtype != df_info['df'][col].dtype:
+                        merged_df[col] = merged_df[col].astype(str)
+                        df_info['df'][col] = df_info['df'][col].astype(str)
             merged_df = pd.merge(
                 merged_df,
                 df_info['df'],
             st.error(f"Error merging {df_info['filename']}: {str(e)}")
             continue
+    return clean_dataframe(merged_df)
+def display_dataframe_sample(df: pd.DataFrame, title: str = "Data Preview"):
+    """Safely display a DataFrame sample in Streamlit"""
+    try:
+        st.write(title)
+        # Create a clean copy for display
+        display_df = df.head().copy()
+        # Convert all columns to string for safe display
+        for col in display_df.columns:
+            display_df[col] = display_df[col].astype(str)
+        st.dataframe(display_df)
+    except Exception as e:
+        st.error(f"Error displaying DataFrame: {str(e)}")
 def main():
     st.title("Smart CSV Processor")
         for uploaded_file in uploaded_files:
             st.write(f"#### Analyzing: {uploaded_file.name}")
+            try:
+                # Read file into DataFrame
+                if uploaded_file.name.endswith(('.xlsx', '.xls')):
+                    df = convert_excel_to_csv(uploaded_file)
+                else:
+                    df = pd.read_csv(uploaded_file)
+                    df = clean_dataframe(df)
+                if df is not None:
+                    # Show initial data preview
+                    display_dataframe_sample(df, "Initial Preview:")
+                    # Analyze columns using Gemini
+                    with st.spinner("Analyzing columns with AI..."):
+                        analysis = analyze_columns(df, uploaded_file.name)
+                    if analysis:
+                        st.write("Column Analysis:")
+                        st.json(analysis)
+                        # Store DataFrame and its analysis
+                        processed_files.append({
+                            'filename': uploaded_file.name,
+                            'df': df,
+                            'analysis': analysis
+                        })
+                        # Apply suggested column renames if any
+                        if 'suggested_renames' in analysis and analysis['suggested_renames']:
+                            df.rename(columns=analysis['suggested_renames'], inplace=True)
+                            st.write("Applied suggested column renames.")
+                            display_dataframe_sample(df, "Updated Preview:")
+            except Exception as e:
+                st.error(f"Error processing {uploaded_file.name}: {str(e)}")
+                continue
         if len(processed_files) > 1:
             st.write("### Merging DataFrames")
                     if merged_df is not None:
                         st.write("### Preview of Merged Data")
+                        display_dataframe_sample(merged_df)
                         # Download button for merged CSV
+                        try:
+                            csv = merged_df.to_csv(index=False)
+                            st.download_button(
+                                label="Download Merged CSV",
+                                data=csv,
+                                file_name="merged_data.csv",
+                                mime="text/csv"
+                            )
+                        except Exception as e:
+                            st.error(f"Error creating download file: {str(e)}")
                         # Show statistics
                         st.write("### Dataset Statistics")
                         st.write("### Data Quality Metrics")
                         missing_values = merged_df.isnull().sum()
                         st.write("Missing values per column:")
+                        st.dataframe(pd.DataFrame({
+                            'Column': missing_values.index,
+                            'Missing Values': missing_values.values
+                        }))
                         # Show duplicate check
                         duplicates = merged_df.duplicated().sum()