Spaces:

rairo
/

OneExcelZimraAI

Build error

App Files Files Community

rairo commited on Feb 20, 2025

Commit

68a00f6

verified ·

1 Parent(s): fddc41f

Update app.py

Browse files

Files changed (1) hide show

app.py +94 -194

app.py CHANGED Viewed

@@ -17,7 +17,9 @@ def clean_column_name(col_name):
     """Clean column names to be compatible with Arrow"""
     if not isinstance(col_name, str):
         return str(col_name)
-    return col_name.strip()
 def is_salary_column(column_name: str) -> bool:
     """Check if column name suggests it contains salary/monetary data"""
@@ -32,9 +34,7 @@ def clean_monetary_value(value):
     if isinstance(value, (int, float)):
         return float(value)
-    # Convert to string if not already
     value_str = str(value)
     # Remove currency symbols, commas, and other non-numeric characters except decimal points
     cleaned = re.sub(r'[^0-9.-]', '', value_str)
@@ -43,132 +43,92 @@ def clean_monetary_value(value):
     except (ValueError, TypeError):
         return np.nan
-def safe_convert_column(df: pd.DataFrame, column: str) -> pd.Series:
-    """Safely convert a column to the appropriate type"""
-    series = df[column].copy()
-    # Handle salary/monetary columns
-    if is_salary_column(column):
-        return series.apply(clean_monetary_value)
-    # Try numeric conversion first
-    numeric_series = pd.to_numeric(series, errors='coerce')
-    if numeric_series.notna().any():
-        return numeric_series
-    # If not numeric, convert to string
-    return series.astype(str)
-def clean_dataframe(df: pd.DataFrame) -> pd.DataFrame:
-    """Clean DataFrame to ensure Arrow compatibility"""
-    # Create a copy to avoid modifying the original
-    cleaned_df = df.copy()
-    # Clean column names
-    cleaned_df.columns = [clean_column_name(col) for col in cleaned_df.columns]
-    # Process each column
-    for column in cleaned_df.columns:
-        try:
-            cleaned_df[column] = safe_convert_column(cleaned_df, column)
-        except Exception as e:
-            st.warning(f"Error processing column {column}: {str(e)}")
-            # Fallback to string conversion
-            cleaned_df[column] = cleaned_df[column].astype(str)
-    return cleaned_df
-def safe_display_df(df: pd.DataFrame) -> pd.DataFrame:
-    """Prepare DataFrame for safe display in Streamlit"""
-    display_df = df.copy()
-    # Convert all columns to string for display
-    for col in display_df.columns:
-        try:
-            if is_salary_column(col):
-                # Format monetary values
-                display_df[col] = display_df[col].apply(lambda x: f"${x:,.2f}" if pd.notna(x) else "")
-            else:
-                # Convert other columns to string
-                display_df[col] = display_df[col].astype(str).apply(lambda x: "" if x == "nan" else x)
-        except Exception as e:
-            display_df[col] = display_df[col].astype(str)
-    return display_df
-def convert_excel_to_csv(excel_file):
-    """Convert Excel file to CSV and return the DataFrame"""
-    try:
-        df = pd.read_excel(excel_file)
-        return clean_dataframe(df)
-    except Exception as e:
-        st.error(f"Error converting Excel file: {str(e)}")
-        return None
 def analyze_columns(df: pd.DataFrame, filename: str) -> dict:
-    """Analyze DataFrame columns using Gemini AI"""
-    # Prepare sample data for analysis
-    display_df = safe_display_df(df.head(5))
-    sample_csv = display_df.to_csv(index=False)
-    analysis_prompt = f"""
-    Analyze this CSV data from file '{filename}' and provide the following in JSON format:
-    CSV Data:
-    {sample_csv}
-    Provide analysis in this exact JSON format:
-    {{
-        "subject": "string describing main subject of dataset",
-        "columns": [
-            {{"name": "column_name", "type": "data_type", "description": "column description"}}
-        ],
-        "key_columns": ["potential columns for merging"],
-        "issues": ["list of data quality issues found"],
-        "suggested_renames": {{"old_name": "new_name"}}
-    }}
-    Only respond with the JSON object, no additional text.
-    """
     try:
-        response = model.generate_content(analysis_prompt)
-        return json.loads(response.text)
-    except Exception as e:
-        st.error(f"Error analyzing columns: {str(e)}")
-        return None
-def merge_dataframes(dataframes: List[Dict], common_columns: List[str]) -> pd.DataFrame:
-    """Merge all DataFrames using specified common columns"""
-    if not dataframes:
-        return None
-    try:
-        # Start with the first DataFrame
-        merged_df = dataframes[0]['df'].copy()
-        # Merge with remaining DataFrames
-        for df_info in dataframes[1:]:
-            # Ensure common columns have matching types
-            for col in common_columns:
-                if col in merged_df.columns and col in df_info['df'].columns:
-                    # Convert to string if types don't match
-                    if merged_df[col].dtype != df_info['df'][col].dtype:
-                        merged_df[col] = merged_df[col].astype(str)
-                        df_info['df'][col] = df_info['df'][col].astype(str)
-            merged_df = pd.merge(
-                merged_df,
-                df_info['df'],
-                on=common_columns,
-                how='outer',
-                suffixes=(None, f'_{df_info["filename"]}')
-            )
-        return clean_dataframe(merged_df)
     except Exception as e:
-        st.error(f"Error merging DataFrames: {str(e)}")
-        return None
 def main():
     st.title("Smart CSV Processor")
@@ -188,22 +148,24 @@ def main():
             st.write(f"#### Analyzing: {uploaded_file.name}")
             try:
-                # Read and clean data
                 if uploaded_file.name.endswith(('.xlsx', '.xls')):
-                    df = convert_excel_to_csv(uploaded_file)
                 else:
                     df = pd.read_csv(uploaded_file)
-                    df = clean_dataframe(df)
                 if df is not None:
                     # Show initial data preview
                     st.write("Initial Preview:")
-                    st.dataframe(safe_display_df(df.head()))
-                    # Analyze columns
-                    with st.spinner("Analyzing columns with AI..."):
                         analysis = analyze_columns(df, uploaded_file.name)
                     if analysis:
                         st.write("Column Analysis:")
                         st.json(analysis)
@@ -214,73 +176,11 @@ def main():
                             'analysis': analysis
                         })
-                        if 'suggested_renames' in analysis and analysis['suggested_renames']:
-                            df.rename(columns=analysis['suggested_renames'], inplace=True)
-                            st.write("Updated Preview (after renaming):")
-                            st.dataframe(safe_display_df(df.head()))
             except Exception as e:
                 st.error(f"Error processing {uploaded_file.name}: {str(e)}")
                 continue
-        if len(processed_files) > 1:
-            st.write("### Merging DataFrames")
-            # Find common columns
-            common_columns = list(set.intersection(*[
-                set(df_info['df'].columns) for df_info in processed_files
-            ]))
-            if common_columns:
-                st.write("Common columns found:", common_columns)
-                selected_columns = st.multiselect(
-                    "Select columns to use for merging",
-                    options=common_columns,
-                    default=common_columns
-                )
-                if selected_columns:
-                    with st.spinner("Merging datasets..."):
-                        merged_df = merge_dataframes(processed_files, selected_columns)
-                    if merged_df is not None:
-                        st.write("### Preview of Merged Data")
-                        st.dataframe(safe_display_df(merged_df.head()))
-                        # Create downloadable CSV
-                        try:
-                            csv = merged_df.to_csv(index=False)
-                            st.download_button(
-                                label="Download Merged CSV",
-                                data=csv,
-                                file_name="merged_data.csv",
-                                mime="text/csv"
-                            )
-                            # Show statistics
-                            st.write("### Dataset Statistics")
-                            st.write(f"Total rows: {len(merged_df)}")
-                            st.write(f"Total columns: {len(merged_df.columns)}")
-                            # Data quality metrics
-                            st.write("### Data Quality Metrics")
-                            missing_df = pd.DataFrame({
-                                'Column': merged_df.columns,
-                                'Missing Values': merged_df.isnull().sum().values,
-                                'Missing Percentage': (merged_df.isnull().sum().values / len(merged_df) * 100).round(2)
-                            })
-                            st.dataframe(missing_df)
-                            duplicates = merged_df.duplicated().sum()
-                            st.write(f"Number of duplicate rows: {duplicates}")
-                        except Exception as e:
-                            st.error(f"Error preparing download: {str(e)}")
-            else:
-                st.warning("No common columns found across datasets.")
-        else:
-            st.warning("Please upload at least 2 files to merge.")
 if __name__ == "__main__":
     main()

     """Clean column names to be compatible with Arrow"""
     if not isinstance(col_name, str):
         return str(col_name)
+    # Remove special characters and extra spaces
+    cleaned = re.sub(r'[^\w\s]', ' ', col_name)
+    return re.sub(r'\s+', '_', cleaned.strip().lower())
 def is_salary_column(column_name: str) -> bool:
     """Check if column name suggests it contains salary/monetary data"""
     if isinstance(value, (int, float)):
         return float(value)
     value_str = str(value)
     # Remove currency symbols, commas, and other non-numeric characters except decimal points
     cleaned = re.sub(r'[^0-9.-]', '', value_str)
     except (ValueError, TypeError):
         return np.nan
 def analyze_columns(df: pd.DataFrame, filename: str) -> dict:
+    """Analyze DataFrame columns using Gemini AI with improved error handling"""
     try:
+        # Prepare sample data for analysis
+        display_df = df.head(5).copy()
+        # Convert all columns to string for display
+        for col in display_df.columns:
+            display_df[col] = display_df[col].astype(str)
+        sample_csv = display_df.to_csv(index=False)
+        # Create a more structured prompt
+        prompt = f"""
+        Analyze this CSV data and provide analysis in JSON format.
+        Filename: {filename}
+        Sample data:
+        {sample_csv}
+        Respond with only a valid JSON object in this format:
+        {{
+            "subject": "Employee payroll data",
+            "columns": [
+                {{
+                    "name": "column_name",
+                    "type": "string/number/date",
+                    "description": "Brief description"
+                }}
+            ],
+            "key_columns": ["employee_id", "tin"],
+            "issues": ["Missing values in salary column"],
+            "suggested_renames": {{
+                "old_name": "new_name"
+            }}
+        }}
+        """
+        response = model.generate_content(prompt)
+        response_text = response.text.strip()
+        # Handle potential markdown code block
+        if response_text.startswith('```json'):
+            response_text = response_text[7:-3]  # Remove ```json and ```
+        elif response_text.startswith('```'):
+            response_text = response_text[3:-3]  # Remove ``` and ```
+        response_text = response_text.strip()
+        try:
+            analysis = json.loads(response_text)
+            return analysis
+        except json.JSONDecodeError as je:
+            st.error(f"JSON parsing error: {str(je)}")
+            st.text("Raw response:")
+            st.text(response_text)
+            return {
+                "subject": "Error parsing analysis",
+                "columns": [],
+                "key_columns": [],
+                "issues": ["Error analyzing columns"],
+                "suggested_renames": {}
+            }
     except Exception as e:
+        st.error(f"Error in column analysis: {str(e)}")
+        return {
+            "subject": "Error in analysis",
+            "columns": [],
+            "key_columns": [],
+            "issues": [str(e)],
+            "suggested_renames": {}
+        }
+def read_excel_file(file) -> pd.DataFrame:
+    """Read Excel file with improved error handling"""
+    try:
+        # Try reading with default engine
+        return pd.read_excel(file, engine='openpyxl')
+    except Exception as e1:
+        try:
+            # Fallback to xlrd engine for older Excel files
+            return pd.read_excel(file, engine='xlrd')
+        except Exception as e2:
+            st.error(f"Failed to read Excel file: {str(e2)}")
+            return None
 def main():
     st.title("Smart CSV Processor")
             st.write(f"#### Analyzing: {uploaded_file.name}")
             try:
+                # Read the file
                 if uploaded_file.name.endswith(('.xlsx', '.xls')):
+                    df = read_excel_file(uploaded_file)
                 else:
                     df = pd.read_csv(uploaded_file)
                 if df is not None:
+                    # Clean column names
+                    df.columns = [clean_column_name(col) for col in df.columns]
                     # Show initial data preview
                     st.write("Initial Preview:")
+                    st.dataframe(df.head())
+                    # Analyze columns with improved error handling
+                    with st.spinner("Analyzing columns..."):
                         analysis = analyze_columns(df, uploaded_file.name)
                     if analysis:
                         st.write("Column Analysis:")
                         st.json(analysis)
                             'analysis': analysis
                         })
             except Exception as e:
                 st.error(f"Error processing {uploaded_file.name}: {str(e)}")
                 continue
+        # Rest of the merging logic remains the same...
 if __name__ == "__main__":
     main()