Spaces:

rairo
/

OneExcelZimraAI

Build error

App Files Community

rairo committed on Feb 20, 2025

Commit

3a17237

verified ·

1 Parent(s): 60414d5

Update app.py

Browse files

Files changed (1) hide show

app.py +35 -29

app.py CHANGED Viewed

@@ -3,15 +3,13 @@ import pandas as pd
 import os
 from io import BytesIO
 import google.generativeai as genai
-from google.generativeai import types
-import pathlib
 from typing import List, Dict
 import json
 import tempfile
 # Initialize Google Gemini AI client
 genai.configure(api_key=os.environ["GOOGLE_API_KEY"])
-client = genai.GenerativeModel('gemini-2.0-flash-thinking-exp')
 def convert_excel_to_csv(excel_file):
     """Convert Excel file to CSV and return the DataFrame"""
@@ -27,32 +25,28 @@ def analyze_columns(df: pd.DataFrame, filename: str) -> dict:
     # Convert sample of DataFrame to CSV string
     sample_csv = df.head(5).to_csv(index=False)
-    analysis_prompt = """
-    Analyze this CSV data and provide the following in JSON format:
-    1. Identify the main subject/entity of this dataset
-    2. List all columns and their likely content type (text, number, date, etc.)
-    3. Identify potential key columns that could be used for merging with other datasets
-    4. Flag any inconsistencies or data quality issues
-    5. Suggest any column renamings for clarity
-    Response format:
-    {
-        "subject": "string",
-        "columns": [{"name": "string", "type": "string", "description": "string"}],
-        "key_columns": ["string"],
-        "issues": ["string"],
-        "suggested_renames": {"old_name": "new_name"}
-    }
     """
     try:
-        response = client.generate_content(
-            contents=[
-                types.Part.from_text(f"Filename: {filename}\n\nCSV Data:\n{sample_csv}"),
-                analysis_prompt
-            ]
-        )
         # Parse JSON response
         analysis = json.loads(response.text)
         return analysis
@@ -126,8 +120,13 @@ def main():
                 df = pd.read_csv(uploaded_file)
             if df is not None:
                 # Analyze columns using Gemini
-                analysis = analyze_columns(df, uploaded_file.name)
                 if analysis:
                     st.write("Column Analysis:")
@@ -140,10 +139,12 @@ def main():
                         'analysis': analysis
                     })
-                    # Apply suggested column renames
-                    if 'suggested_renames' in analysis:
                         df.rename(columns=analysis['suggested_renames'], inplace=True)
                         st.write("Applied suggested column renames.")
         if len(processed_files) > 1:
             st.write("### Merging DataFrames")
@@ -163,7 +164,8 @@ def main():
                 if selected_columns:
                     # Merge DataFrames
-                    merged_df = merge_dataframes(processed_files, selected_columns)
                     if merged_df is not None:
                         st.write("### Preview of Merged Data")
@@ -188,6 +190,10 @@ def main():
                         missing_values = merged_df.isnull().sum()
                         st.write("Missing values per column:")
                         st.dataframe(missing_values)
             else:
                 st.warning("No common columns found across datasets.")
         else:

 import os
 from io import BytesIO
 import google.generativeai as genai
 from typing import List, Dict
 import json
 import tempfile
 # Initialize Google Gemini AI client
 genai.configure(api_key=os.environ["GOOGLE_API_KEY"])
+model = genai.GenerativeModel('gemini-2.0-flash-thinking-exp')
 def convert_excel_to_csv(excel_file):
     """Convert Excel file to CSV and return the DataFrame"""
     # Convert sample of DataFrame to CSV string
     sample_csv = df.head(5).to_csv(index=False)
+    analysis_prompt = f"""
+    Analyze this CSV data from file '{filename}' and provide the following in JSON format:
+    CSV Data:
+    {sample_csv}
+    Provide analysis in this exact JSON format:
+    {{
+        "subject": "string describing main subject of dataset",
+        "columns": [
+            {{"name": "column_name", "type": "data_type", "description": "column description"}}
+        ],
+        "key_columns": ["potential columns for merging"],
+        "issues": ["list of data quality issues found"],
+        "suggested_renames": {{"old_name": "new_name"}}
+    }}
+    Only respond with the JSON object, no additional text.
     """
     try:
+        response = model.generate_content(analysis_prompt)
         # Parse JSON response
         analysis = json.loads(response.text)
         return analysis
                 df = pd.read_csv(uploaded_file)
             if df is not None:
+                # Show initial data preview
+                st.write("Initial Preview:")
+                st.dataframe(df.head())
                 # Analyze columns using Gemini
+                with st.spinner("Analyzing columns with AI..."):
+                    analysis = analyze_columns(df, uploaded_file.name)
                 if analysis:
                     st.write("Column Analysis:")
                         'analysis': analysis
                     })
+                    # Apply suggested column renames if any
+                    if 'suggested_renames' in analysis and analysis['suggested_renames']:
                         df.rename(columns=analysis['suggested_renames'], inplace=True)
                         st.write("Applied suggested column renames.")
+                        st.write("Updated Preview:")
+                        st.dataframe(df.head())
         if len(processed_files) > 1:
             st.write("### Merging DataFrames")
                 if selected_columns:
                     # Merge DataFrames
+                    with st.spinner("Merging datasets..."):
+                        merged_df = merge_dataframes(processed_files, selected_columns)
                     if merged_df is not None:
                         st.write("### Preview of Merged Data")
                         missing_values = merged_df.isnull().sum()
                         st.write("Missing values per column:")
                         st.dataframe(missing_values)
+                        # Show duplicate check
+                        duplicates = merged_df.duplicated().sum()
+                        st.write(f"Number of duplicate rows: {duplicates}")
             else:
                 st.warning("No common columns found across datasets.")
         else: