Spaces:

rairo
/

OneExcelZimraAI

Build error

App Files Files Community

rairo committed on Feb 20, 2025

Commit

91d26e7

verified ·

1 Parent(s): 6250aa7

Update app.py

Browse files

Files changed (1) hide show

app.py +124 -111

app.py CHANGED Viewed

@@ -11,38 +11,88 @@ import tempfile
 genai.configure(api_key=os.environ["GOOGLE_API_KEY"])
 model = genai.GenerativeModel('gemini-2.0-flash-thinking-exp')
 def clean_dataframe(df: pd.DataFrame) -> pd.DataFrame:
     """Clean DataFrame to ensure Arrow compatibility"""
-    for column in df.columns:
-        # Convert column name to string if it's not already
-        if not isinstance(column, str):
-            df.rename(columns={column: str(column)}, inplace=True)
-        # Handle mixed types in columns
-        if df[column].dtype == 'object':
-            # Try to convert to numeric, coerce errors to NaN
-            numeric_conversion = pd.to_numeric(df[column], errors='coerce')
-            if numeric_conversion.notna().any():
-                df[column] = numeric_conversion
             else:
-                # If not numeric, ensure string type
-                df[column] = df[column].astype(str)
-        # Convert any remaining object types to string
-        if df[column].dtype == 'object':
-            df[column] = df[column].astype(str)
-        # Handle special cases for numeric columns
-        if pd.api.types.is_numeric_dtype(df[column]):
-            # Check if column contains large numbers that might cause overflow
-            if df[column].max() > 1e9 or df[column].min() < -1e9:
-                df[column] = df[column].astype('float64')
-        # Replace infinity values with NaN
-        if pd.api.types.is_numeric_dtype(df[column]):
-            df[column] = df[column].replace([np.inf, -np.inf], np.nan)
-    return df
 def convert_excel_to_csv(excel_file):
     """Convert Excel file to CSV and return the DataFrame"""
@@ -55,12 +105,9 @@ def convert_excel_to_csv(excel_file):
 def analyze_columns(df: pd.DataFrame, filename: str) -> dict:
     """Analyze DataFrame columns using Gemini AI"""
-    # Convert sample of DataFrame to CSV string
-    sample_df = df.head(5).copy()
-    # Convert all columns to string for analysis
-    for col in sample_df.columns:
-        sample_df[col] = sample_df[col].astype(str)
-    sample_csv = sample_df.to_csv(index=False)
     analysis_prompt = f"""
     Analyze this CSV data from file '{filename}' and provide the following in JSON format:
@@ -84,40 +131,23 @@ def analyze_columns(df: pd.DataFrame, filename: str) -> dict:
     try:
         response = model.generate_content(analysis_prompt)
-        # Parse JSON response
-        analysis = json.loads(response.text)
-        return analysis
     except Exception as e:
         st.error(f"Error analyzing columns: {str(e)}")
         return None
-def find_common_columns(dataframes: List[Dict]) -> List[str]:
-    """Find potential common columns across all DataFrames based on Gemini analysis"""
-    all_key_columns = []
-    for df_info in dataframes:
-        if df_info['analysis'] and 'key_columns' in df_info['analysis']:
-            all_key_columns.extend(df_info['analysis']['key_columns'])
-    # Count frequency of each column
-    from collections import Counter
-    column_freq = Counter(all_key_columns)
-    # Return columns that appear in multiple datasets
-    common_columns = [col for col, freq in column_freq.items() if freq > 1]
-    return common_columns
 def merge_dataframes(dataframes: List[Dict], common_columns: List[str]) -> pd.DataFrame:
     """Merge all DataFrames using specified common columns"""
     if not dataframes:
         return None
-    # Start with the first DataFrame
-    merged_df = dataframes[0]['df'].copy()
-    # Merge with remaining DataFrames
-    for df_info in dataframes[1:]:
-        try:
-            # Ensure common columns have the same type before merging
             for col in common_columns:
                 if col in merged_df.columns and col in df_info['df'].columns:
                     # Convert to string if types don't match
@@ -132,30 +162,16 @@ def merge_dataframes(dataframes: List[Dict], common_columns: List[str]) -> pd.Da
                 how='outer',
                 suffixes=(None, f'_{df_info["filename"]}')
             )
-        except Exception as e:
-            st.error(f"Error merging {df_info['filename']}: {str(e)}")
-            continue
-    return clean_dataframe(merged_df)
-def display_dataframe_sample(df: pd.DataFrame, title: str = "Data Preview"):
-    """Safely display a DataFrame sample in Streamlit"""
-    try:
-        st.write(title)
-        # Create a clean copy for display
-        display_df = df.head().copy()
-        # Convert all columns to string for safe display
-        for col in display_df.columns:
-            display_df[col] = display_df[col].astype(str)
-        st.dataframe(display_df)
     except Exception as e:
-        st.error(f"Error displaying DataFrame: {str(e)}")
 def main():
     st.title("Smart CSV Processor")
     st.write("Upload CSV or Excel files for intelligent analysis and merging.")
-    # File uploader
     uploaded_files = st.file_uploader(
         "Choose files",
         accept_multiple_files=True,
@@ -164,15 +180,13 @@ def main():
     if uploaded_files:
         st.write("### Processing Files")
-        # Process each file and store DataFrames with their analysis
         processed_files = []
         for uploaded_file in uploaded_files:
             st.write(f"#### Analyzing: {uploaded_file.name}")
             try:
-                # Read file into DataFrame
                 if uploaded_file.name.endswith(('.xlsx', '.xls')):
                     df = convert_excel_to_csv(uploaded_file)
                 else:
@@ -181,9 +195,10 @@ def main():
                 if df is not None:
                     # Show initial data preview
-                    display_dataframe_sample(df, "Initial Preview:")
-                    # Analyze columns using Gemini
                     with st.spinner("Analyzing columns with AI..."):
                         analysis = analyze_columns(df, uploaded_file.name)
@@ -191,18 +206,16 @@ def main():
                         st.write("Column Analysis:")
                         st.json(analysis)
-                        # Store DataFrame and its analysis
                         processed_files.append({
                             'filename': uploaded_file.name,
                             'df': df,
                             'analysis': analysis
                         })
-                        # Apply suggested column renames if any
                         if 'suggested_renames' in analysis and analysis['suggested_renames']:
                             df.rename(columns=analysis['suggested_renames'], inplace=True)
-                            st.write("Applied suggested column renames.")
-                            display_dataframe_sample(df, "Updated Preview:")
             except Exception as e:
                 st.error(f"Error processing {uploaded_file.name}: {str(e)}")
@@ -212,12 +225,13 @@ def main():
             st.write("### Merging DataFrames")
             # Find common columns
-            common_columns = find_common_columns(processed_files)
             if common_columns:
-                st.write("Detected common columns:", common_columns)
-                # Let user select columns to use for merging
                 selected_columns = st.multiselect(
                     "Select columns to use for merging",
                     options=common_columns,
@@ -225,15 +239,14 @@ def main():
                 )
                 if selected_columns:
-                    # Merge DataFrames
                     with st.spinner("Merging datasets..."):
                         merged_df = merge_dataframes(processed_files, selected_columns)
                     if merged_df is not None:
                         st.write("### Preview of Merged Data")
-                        display_dataframe_sample(merged_df)
-                        # Download button for merged CSV
                         try:
                             csv = merged_df.to_csv(index=False)
                             st.download_button(
@@ -242,26 +255,26 @@ def main():
                                 file_name="merged_data.csv",
                                 mime="text/csv"
                             )
                         except Exception as e:
-                            st.error(f"Error creating download file: {str(e)}")
-                        # Show statistics
-                        st.write("### Dataset Statistics")
-                        st.write(f"Total rows: {len(merged_df)}")
-                        st.write(f"Total columns: {len(merged_df.columns)}")
-                        # Show data quality metrics
-                        st.write("### Data Quality Metrics")
-                        missing_values = merged_df.isnull().sum()
-                        st.write("Missing values per column:")
-                        st.dataframe(pd.DataFrame({
-                            'Column': missing_values.index,
-                            'Missing Values': missing_values.values
-                        }))
-                        # Show duplicate check
-                        duplicates = merged_df.duplicated().sum()
-                        st.write(f"Number of duplicate rows: {duplicates}")
             else:
                 st.warning("No common columns found across datasets.")
         else:

 genai.configure(api_key=os.environ["GOOGLE_API_KEY"])
 model = genai.GenerativeModel('gemini-2.0-flash-thinking-exp')
def clean_column_name(col_name):
    """Return *col_name* as a whitespace-trimmed string.

    Non-string labels (ints, floats, tuples, ...) are converted with
    ``str()`` first and then stripped exactly like native string labels, so
    every label goes through a single normalization path.

    Args:
        col_name: A column label of any type.

    Returns:
        The label as a ``str`` with leading/trailing whitespace removed.
    """
    return str(col_name).strip()
def is_salary_column(column_name: str) -> bool:
    """Report whether a column label looks like salary/monetary data.

    The check is a case-insensitive substring match against a fixed keyword
    list, so a label such as ``"Coffee"`` also matches (via ``"fee"``).

    Args:
        column_name: Column label. Non-string labels (for example integer
            headers) are converted with ``str()`` instead of raising
            ``AttributeError`` on ``.lower()``.

    Returns:
        True if any keyword occurs in the lower-cased label, else False.
    """
    salary_keywords = (
        'salary', 'wage', 'income', 'earning',
        'commission', 'fee', 'payment', 'compensation',
    )
    label = str(column_name).lower()
    return any(keyword in label for keyword in salary_keywords)
def clean_monetary_value(value):
    """Convert a monetary cell value to ``float``.

    Args:
        value: A number, a missing value, or text such as ``"$1,234.50"``.

    Returns:
        ``float`` for numeric input or parseable text; ``np.nan`` for missing
        values and for text that does not reduce to a valid number.
        Text wrapped in parentheses, e.g. ``"(500.00)"`` (accounting notation
        for a negative amount), is returned as a negative value.
    """
    if pd.isna(value):
        return np.nan
    if isinstance(value, (int, float)):
        return float(value)

    text = str(value).strip()
    # The regex below drops the parentheses, so remember them first;
    # otherwise "(500)" would silently come back as +500.
    is_parenthesised = text.startswith('(') and text.endswith(')')

    # Keep only digits, decimal points and minus signs (drops currency
    # symbols, thousands separators, spaces, letters).
    cleaned = re.sub(r'[^0-9.-]', '', text)
    try:
        amount = float(cleaned)
    except (ValueError, TypeError):
        # e.g. "" after cleaning, "1.2.3", or a stray "-"
        return np.nan
    return -abs(amount) if is_parenthesised else amount
def safe_convert_column(df: pd.DataFrame, column: str) -> pd.Series:
    """Return ``df[column]`` converted to a monetary, numeric or string Series.

    Conversion rules, in order:

    1. If ``is_salary_column(column)`` is true, every value is passed through
       ``clean_monetary_value``.
    2. Otherwise, if the column has at least one non-null value and *every*
       non-null value parses as a number, the numeric Series is returned.
    3. Otherwise the column is returned as strings.

    The input DataFrame is not modified; a copy of the column is converted.

    Args:
        df: Source DataFrame.
        column: Label of the column to convert.

    Returns:
        The converted column as a new ``pd.Series``.
    """
    series = df[column].copy()

    # Handle salary/monetary columns
    if is_salary_column(column):
        return series.apply(clean_monetary_value)

    # errors='coerce' replaces unparseable entries with NaN. Accept the
    # numeric result only when no originally-present value was lost that way;
    # otherwise a mixed column such as ["A12", "34"] would silently lose "A12".
    numeric_series = pd.to_numeric(series, errors='coerce')
    present = series.notna()
    if present.any() and numeric_series[present].notna().all():
        return numeric_series

    # Not cleanly numeric: fall back to string representation
    return series.astype(str)
 def clean_dataframe(df: pd.DataFrame) -> pd.DataFrame:
     """Return a cleaned copy of *df*; the input DataFrame is left untouched.

     Column labels are normalized with ``clean_column_name`` and each column
     is converted with ``safe_convert_column``. If converting a column raises,
     a Streamlit warning is shown and that column is stored as strings.
     """
     result = df.copy()
     result.columns = list(map(clean_column_name, result.columns))

     for name in result.columns:
         try:
             result[name] = safe_convert_column(result, name)
         except Exception as exc:
             st.warning(f"Error processing column {name}: {str(exc)}")
             # Last resort: keep the data, but as plain strings
             result[name] = result[name].astype(str)

     return result
def safe_display_df(df: pd.DataFrame) -> pd.DataFrame:
    """Return a string-only copy of *df* for display in Streamlit.

    Columns whose label is flagged by ``is_salary_column`` are rendered per
    value as ``$1,234.50``; missing values become ``""`` and values that
    cannot be read as a number are shown as their plain string form, so one
    bad cell no longer strips the formatting from the whole column.
    All other columns are converted with ``astype(str)``, with ``"nan"``
    replaced by ``""``.

    Args:
        df: DataFrame to prepare. It is copied, not modified.

    Returns:
        A new DataFrame containing display strings.
    """
    display_df = df.copy()
    for col in display_df.columns:
        try:
            # str(col): labels may be non-strings (e.g. integer headers)
            if is_salary_column(str(col)):
                display_df[col] = display_df[col].apply(_format_money)
            else:
                display_df[col] = display_df[col].astype(str).apply(
                    lambda x: "" if x == "nan" else x
                )
        except Exception:
            # Best-effort display: fall back to raw string conversion
            display_df[col] = display_df[col].astype(str)
    return display_df


def _format_money(value):
    """Format one cell as ``$#,###.##``; blank if missing, raw text if not numeric."""
    if pd.isna(value):
        return ""
    try:
        return f"${float(value):,.2f}"
    except (TypeError, ValueError):
        return str(value)
 def convert_excel_to_csv(excel_file):
     """Convert Excel file to CSV and return the DataFrame"""
 def analyze_columns(df: pd.DataFrame, filename: str) -> dict:
     """Analyze DataFrame columns using Gemini AI"""
+    # Prepare sample data for analysis
+    display_df = safe_display_df(df.head(5))
+    sample_csv = display_df.to_csv(index=False)
     analysis_prompt = f"""
     Analyze this CSV data from file '{filename}' and provide the following in JSON format:
     try:
         response = model.generate_content(analysis_prompt)
+        return json.loads(response.text)
     except Exception as e:
         st.error(f"Error analyzing columns: {str(e)}")
         return None
 def merge_dataframes(dataframes: List[Dict], common_columns: List[str]) -> pd.DataFrame:
     """Merge all DataFrames using specified common columns"""
     if not dataframes:
         return None
+    try:
+        # Start with the first DataFrame
+        merged_df = dataframes[0]['df'].copy()
+        # Merge with remaining DataFrames
+        for df_info in dataframes[1:]:
+            # Ensure common columns have matching types
             for col in common_columns:
                 if col in merged_df.columns and col in df_info['df'].columns:
                     # Convert to string if types don't match
                 how='outer',
                 suffixes=(None, f'_{df_info["filename"]}')
             )
+        return clean_dataframe(merged_df)
     except Exception as e:
+        st.error(f"Error merging DataFrames: {str(e)}")
+        return None
 def main():
     st.title("Smart CSV Processor")
     st.write("Upload CSV or Excel files for intelligent analysis and merging.")
     uploaded_files = st.file_uploader(
         "Choose files",
         accept_multiple_files=True,
     if uploaded_files:
         st.write("### Processing Files")
         processed_files = []
         for uploaded_file in uploaded_files:
             st.write(f"#### Analyzing: {uploaded_file.name}")
             try:
+                # Read and clean data
                 if uploaded_file.name.endswith(('.xlsx', '.xls')):
                     df = convert_excel_to_csv(uploaded_file)
                 else:
                 if df is not None:
                     # Show initial data preview
+                    st.write("Initial Preview:")
+                    st.dataframe(safe_display_df(df.head()))
+                    # Analyze columns
                     with st.spinner("Analyzing columns with AI..."):
                         analysis = analyze_columns(df, uploaded_file.name)
                         st.write("Column Analysis:")
                         st.json(analysis)
                         processed_files.append({
                             'filename': uploaded_file.name,
                             'df': df,
                             'analysis': analysis
                         })
                         if 'suggested_renames' in analysis and analysis['suggested_renames']:
                             df.rename(columns=analysis['suggested_renames'], inplace=True)
+                            st.write("Updated Preview (after renaming):")
+                            st.dataframe(safe_display_df(df.head()))
             except Exception as e:
                 st.error(f"Error processing {uploaded_file.name}: {str(e)}")
             st.write("### Merging DataFrames")
             # Find common columns
+            common_columns = list(set.intersection(*[
+                set(df_info['df'].columns) for df_info in processed_files
+            ]))
             if common_columns:
+                st.write("Common columns found:", common_columns)
                 selected_columns = st.multiselect(
                     "Select columns to use for merging",
                     options=common_columns,
                 )
                 if selected_columns:
                     with st.spinner("Merging datasets..."):
                         merged_df = merge_dataframes(processed_files, selected_columns)
                     if merged_df is not None:
                         st.write("### Preview of Merged Data")
+                        st.dataframe(safe_display_df(merged_df.head()))
+                        # Create downloadable CSV
                         try:
                             csv = merged_df.to_csv(index=False)
                             st.download_button(
                                 file_name="merged_data.csv",
                                 mime="text/csv"
                             )
+                            # Show statistics
+                            st.write("### Dataset Statistics")
+                            st.write(f"Total rows: {len(merged_df)}")
+                            st.write(f"Total columns: {len(merged_df.columns)}")
+                            # Data quality metrics
+                            st.write("### Data Quality Metrics")
+                            missing_df = pd.DataFrame({
+                                'Column': merged_df.columns,
+                                'Missing Values': merged_df.isnull().sum().values,
+                                'Missing Percentage': (merged_df.isnull().sum().values / len(merged_df) * 100).round(2)
+                            })
+                            st.dataframe(missing_df)
+                            duplicates = merged_df.duplicated().sum()
+                            st.write(f"Number of duplicate rows: {duplicates}")
                         except Exception as e:
+                            st.error(f"Error preparing download: {str(e)}")
             else:
                 st.warning("No common columns found across datasets.")
         else: