Spaces:

rairo
/

OneExcelZimraAI

Build error

App Files Files Community

rairo committed on Feb 20, 2025

Commit

943ef38

verified ·

1 Parent(s): d78ab3e

Create app.py

Browse files

Files changed (1) hide show

app.py +198 -0

app.py ADDED Viewed

	@@ -0,0 +1,198 @@

+import streamlit as st
+import pandas as pd
+import os
+from io import BytesIO
+from google import genai
+from google.genai import types
+import pathlib
+from typing import List, Dict
+import json
+import tempfile
+# Initialize the Google Gemini AI client.
+# The google-genai SDK (`from google import genai`) takes the API key on the
+# Client constructor; the module-level `configure()` call belongs to the older
+# `google.generativeai` package and is not available here.
+client = genai.Client(api_key=st.secrets["GOOGLE_API_KEY"])
+def convert_excel_to_csv(excel_file):
+    """Read an Excel file into a DataFrame.
+
+    Despite the name, no CSV is produced: the workbook is parsed with
+    ``pd.read_excel`` and the resulting DataFrame is returned. On any
+    failure the error is shown with ``st.error`` and None is returned.
+    """
+    try:
+        frame = pd.read_excel(excel_file)
+    except Exception as e:
+        # Surface the problem in the UI instead of crashing the app.
+        st.error(f"Error converting Excel file: {str(e)}")
+        return None
+    else:
+        return frame
+def analyze_columns(df: pd.DataFrame, filename: str) -> dict:
+    """Analyze DataFrame columns using Gemini AI.
+
+    The first five rows of ``df`` are serialized to CSV text and sent,
+    together with ``filename`` and a fixed instruction prompt, to the
+    Gemini model. The reply is expected to contain one JSON object.
+
+    Returns:
+        The parsed analysis dict, or None when the request fails or the
+        reply does not contain a parseable JSON object (the error is
+        reported with ``st.error``).
+    """
+    # Only a small sample is sent to keep the prompt short.
+    sample_csv = df.head(5).to_csv(index=False)
+    analysis_prompt = """
+    Analyze this CSV data and provide the following in JSON format:
+    1. Identify the main subject/entity of this dataset
+    2. List all columns and their likely content type (text, number, date, etc.)
+    3. Identify potential key columns that could be used for merging with other datasets
+    4. Flag any inconsistencies or data quality issues
+    5. Suggest any column renamings for clarity
+    Response format:
+    {
+        "subject": "string",
+        "columns": [{"name": "string", "type": "string", "description": "string"}],
+        "key_columns": ["string"],
+        "issues": ["string"],
+        "suggested_renames": {"old_name": "new_name"}
+    }
+    """
+    try:
+        response = client.models.generate_content(
+            model="gemini-2.0-flash-thinking-exp",
+            contents=[
+                # `text` is passed by keyword, as the SDK documents it.
+                types.Part.from_text(text=f"Filename: {filename}\n\nCSV Data:\n{sample_csv}"),
+                analysis_prompt
+            ]
+        )
+        # The model may wrap the JSON in Markdown fences or add commentary,
+        # and `response.text` can be None; keep only the outermost {...} span.
+        raw_text = response.text or ""
+        start = raw_text.find("{")
+        end = raw_text.rfind("}")
+        if start == -1 or end < start:
+            raise ValueError("model response did not contain a JSON object")
+        return json.loads(raw_text[start:end + 1])
+    except Exception as e:
+        st.error(f"Error analyzing columns: {str(e)}")
+        return None
+def find_common_columns(dataframes: List[Dict]) -> List[str]:
+    """Find potential common columns across datasets based on Gemini analysis.
+
+    Each entry of ``dataframes`` is a dict whose optional ``'analysis'``
+    value is a dict with a ``'key_columns'`` list. A column is returned
+    when it is named as a key column by more than one dataset.
+
+    Entries with a missing or malformed analysis are skipped, and a column
+    repeated inside a single dataset's list is counted once, so it cannot
+    qualify as "common" on its own.
+
+    Returns:
+        Column names in first-seen order.
+    """
+    from collections import Counter
+    column_freq = Counter()
+    for df_info in dataframes:
+        analysis = df_info.get('analysis')
+        if not isinstance(analysis, dict):
+            continue
+        key_columns = analysis.get('key_columns')
+        # A bare string would otherwise be iterated character by character.
+        if not isinstance(key_columns, (list, tuple)):
+            continue
+        # De-duplicate within this dataset while keeping order.
+        unique_here = list(dict.fromkeys(
+            col for col in key_columns if isinstance(col, str)
+        ))
+        column_freq.update(unique_here)
+    # Keep columns named by more than one dataset.
+    return [col for col, freq in column_freq.items() if freq > 1]
+def merge_dataframes(dataframes: List[Dict], common_columns: List[str]) -> pd.DataFrame:
+    """Outer-merge every dataset onto the first one using ``common_columns``.
+
+    Each entry of ``dataframes`` is a dict with a ``'df'`` DataFrame and a
+    ``'filename'`` string. Overlapping non-key columns from later files get
+    the suffix ``_<filename>``; columns already in the running result keep
+    their name. A file whose merge raises is reported with ``st.error`` and
+    skipped. Returns None when ``dataframes`` is empty.
+    """
+    if not dataframes:
+        return None
+    first, *remaining = dataframes
+    # Work on a copy so the caller's first DataFrame is left untouched.
+    result = first['df'].copy()
+    for entry in remaining:
+        try:
+            result = pd.merge(
+                result,
+                entry['df'],
+                on=common_columns,
+                how='outer',
+                suffixes=(None, f'_{entry["filename"]}'),
+            )
+        except Exception as e:
+            # Best effort: report the failing file and keep the result so far.
+            st.error(f"Error merging {entry['filename']}: {str(e)}")
+    return result
+def _read_uploaded_file(uploaded_file):
+    """Load one uploaded CSV/Excel file into a DataFrame, or None on failure."""
+    # Match the extension case-insensitively so "DATA.XLSX" is not parsed as CSV.
+    if uploaded_file.name.lower().endswith(('.xlsx', '.xls')):
+        return convert_excel_to_csv(uploaded_file)
+    try:
+        return pd.read_csv(uploaded_file)
+    except Exception as e:
+        # A malformed or empty CSV should not take down the whole app.
+        st.error(f"Error reading CSV file: {str(e)}")
+        return None
+def _apply_suggested_renames(df, analysis):
+    """Rename ``df`` columns in place per ``analysis['suggested_renames']``.
+
+    Only renames whose source column exists in ``df`` and whose target is a
+    non-empty, different string are applied. ``analysis['key_columns']`` is
+    translated through the same mapping so the merge keys keep matching
+    the DataFrame. Returns True when at least one column was renamed.
+    """
+    if not isinstance(analysis, dict):
+        return False
+    renames = analysis.get('suggested_renames')
+    if not isinstance(renames, dict):
+        return False
+    applicable = {
+        old: new
+        for old, new in renames.items()
+        if old in df.columns and isinstance(new, str) and new and new != old
+    }
+    if not applicable:
+        return False
+    df.rename(columns=applicable, inplace=True)
+    key_columns = analysis.get('key_columns')
+    if isinstance(key_columns, list):
+        analysis['key_columns'] = [
+            applicable.get(col, col) if isinstance(col, str) else col
+            for col in key_columns
+        ]
+    return True
+def main():
+    """Streamlit page: upload files, analyze each with Gemini, then merge.
+
+    Every uploaded CSV/Excel file is loaded and analyzed; files that fail
+    to load or analyze are skipped. With at least two analyzed files, the
+    user picks merge keys from the columns Gemini flagged in more than one
+    file, and the outer-merged result is previewed, offered as a CSV
+    download, and summarized (row/column counts, missing values).
+    """
+    st.title("Smart CSV Processor")
+    st.write("Upload CSV or Excel files for intelligent analysis and merging.")
+    # File uploader
+    uploaded_files = st.file_uploader(
+        "Choose files",
+        accept_multiple_files=True,
+        type=['csv', 'xlsx', 'xls']
+    )
+    if uploaded_files:
+        st.write("### Processing Files")
+        # Process each file and store DataFrames with their analysis
+        processed_files = []
+        for uploaded_file in uploaded_files:
+            st.write(f"#### Analyzing: {uploaded_file.name}")
+            df = _read_uploaded_file(uploaded_file)
+            if df is None:
+                continue
+            # Analyze columns using Gemini
+            analysis = analyze_columns(df, uploaded_file.name)
+            if not analysis:
+                continue
+            st.write("Column Analysis:")
+            st.json(analysis)
+            # Rename before storing so the DataFrame and key columns agree.
+            if _apply_suggested_renames(df, analysis):
+                st.write("Applied suggested column renames.")
+            processed_files.append({
+                'filename': uploaded_file.name,
+                'df': df,
+                'analysis': analysis
+            })
+        if len(processed_files) > 1:
+            st.write("### Merging DataFrames")
+            # Find common columns
+            common_columns = find_common_columns(processed_files)
+            if common_columns:
+                st.write("Detected common columns:", common_columns)
+                # Let user select columns to use for merging
+                selected_columns = st.multiselect(
+                    "Select columns to use for merging",
+                    options=common_columns,
+                    default=common_columns
+                )
+                if selected_columns:
+                    # Merge DataFrames
+                    merged_df = merge_dataframes(processed_files, selected_columns)
+                    if merged_df is not None:
+                        st.write("### Preview of Merged Data")
+                        st.dataframe(merged_df.head())
+                        # Download button for merged CSV
+                        csv = merged_df.to_csv(index=False)
+                        st.download_button(
+                            label="Download Merged CSV",
+                            data=csv,
+                            file_name="merged_data.csv",
+                            mime="text/csv"
+                        )
+                        # Show statistics
+                        st.write("### Dataset Statistics")
+                        st.write(f"Total rows: {len(merged_df)}")
+                        st.write(f"Total columns: {len(merged_df.columns)}")
+                        # Show data quality metrics
+                        st.write("### Data Quality Metrics")
+                        missing_values = merged_df.isnull().sum()
+                        st.write("Missing values per column:")
+                        st.dataframe(missing_values)
+            else:
+                st.warning("No common columns found across datasets.")
+        else:
+            st.warning("Please upload at least 2 files to merge.")
+# Run the app only when this file is executed as a script, not when imported.
+if __name__ == "__main__":
+    main()