File size: 8,298 Bytes
ed21fd7
216ce00
 
ed21fd7
 
0b69e41
a614a39
3a72d72
 
ed21fd7
 
0b69e41
ed21fd7
0b69e41
ed21fd7
 
 
 
0b69e41
ed21fd7
a614a39
 
0b69e41
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18b02aa
3a72d72
 
0b69e41
 
ed21fd7
 
0b69e41
 
 
 
18b02aa
a614a39
0b69e41
 
 
 
ed21fd7
 
0b69e41
 
 
 
 
 
 
ed21fd7
0b69e41
 
 
 
 
 
3a72d72
0b69e41
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ed21fd7
 
20ed955
0b69e41
 
 
 
 
ed21fd7
 
 
 
0b69e41
 
 
216ce00
0b69e41
216ce00
0b69e41
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20ed955
0b69e41
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20ed955
0b69e41
 
 
 
 
3a72d72
 
0b69e41
55d1d72
 
 
 
0b69e41
55d1d72
0b69e41
18b02aa
 
 
0b69e41
 
 
20ed955
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
import os
import streamlit as st
import pandas as pd
import numpy as np
import requests
import json
import time 
import matplotlib.pyplot as plt 
import seaborn as sns 

# --- CONFIG ---
# Note: GEMINI_API_KEY is retrieved from environment variables/secrets.
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")

# Fail fast with a visible error in the Streamlit UI when the key is absent;
# st.stop() halts the script so nothing below runs without credentials.
if not GEMINI_API_KEY:
    st.error("❌ Missing Gemini API key. Add it as a secret: GEMINI_API_KEY")
    st.stop()

# Define API endpoints and models
GEMINI_BASE = "https://generativelanguage.googleapis.com/v1beta"
# Using the correct model for structured output
CHAT_MODEL = "gemini-2.5-flash-preview-09-2025" 
# NOTE(review): EMBED_MODEL is defined but never referenced in this file.
EMBED_MODEL = "models/embedding-001" 

# Define the JSON schema for structured output.
# Passed as generationConfig.responseSchema so the model is constrained to
# return exactly these two string fields ('reasoning' and 'code').
ANALYSIS_SCHEMA = {
    "type": "OBJECT",
    "properties": {
        "reasoning": {
            "type": "STRING",
            "description": "A detailed natural language explanation of the analysis, including key findings and context."
        },
        "code": {
            "type": "STRING",
            "description": "The complete, runnable Python code using pandas (df) and streamlit (st). Use st.pyplot() for plots, and st.dataframe() for resulting DataFrames. If no code is needed, this should be an empty string."
        }
    }
}

# System prompt sent with every request; defines the agent's contract:
# markdown reasoning plus self-contained Python that operates on 'df'.
SYSTEM_INSTRUCTION = (
    "You are a world-class Data Analyst Agent. Your task is to analyze the provided DataFrame ('df') "
    "based on the user's question. You MUST respond with a single JSON object conforming to the provided schema. "
    "1. **Reasoning:** Explain your plan, the steps taken, and the insights derived from the data. Format this in Markdown. "
    "2. **Code:** If the question requires calculation, aggregation, or visualization, you MUST generate Python code to execute against the 'df' DataFrame. "
    "   - The DataFrame is already loaded as a variable named 'df'. Do NOT redefine it. "
    "   - Use Streamlit functions for simple outputs: `st.dataframe(...)`, `st.bar_chart()`, `st.line_chart()`. "
    "   - For **ALL** custom, complex plots, you MUST follow this strict Matplotlib sequence: **Start with `plt.figure()`, use `plt.` or `sns.` commands for plotting, and explicitly end with `st.pyplot(plt)`** to display the output. "
    "   - **CRITICAL GUARDRAIL:** When generating code that uses logical conditions (e.g., in `if` statements or for complex filters) on Pandas Series or NumPy arrays, you **MUST** resolve ambiguity by using `.any()` or `.all()`. Do NOT compare a series directly to a single boolean value."
    "   - Ensure the code is self-contained and ready to execute."
)

# --- Helper Functions ---

def chat_with_gemini(prompt, context):
    """Send a prompt and data context to Gemini; return the parsed structured result.

    Parameters
    ----------
    prompt : str
        The user's natural-language question.
    context : str
        Serialized DataFrame context (column names + head rows) sent to the model.

    Returns
    -------
    dict
        Parsed JSON conforming to ANALYSIS_SCHEMA ('reasoning' and 'code' keys).

    Raises
    ------
    requests.exceptions.RequestException
        If every retry attempt fails at the HTTP level.
    KeyError, IndexError, json.JSONDecodeError
        If the response body does not have the expected candidate structure.
    """
    # Correctly prepend 'models/' to the model name in the URL path
    url = f"{GEMINI_BASE}/models/{CHAT_MODEL}:generateContent?key={GEMINI_API_KEY}"

    # Construct the full prompt including the data context
    full_prompt = f"Data Context (DataFrame Head and Columns):\n{context}\n\nUser Question: {prompt}"

    payload = {
        "contents": [
            {"parts": [{"text": full_prompt}]}
        ],
        "systemInstruction": {"parts": [{"text": SYSTEM_INSTRUCTION}]},
        "generationConfig": {
            "responseMimeType": "application/json",
            "responseSchema": ANALYSIS_SCHEMA
        }
    }

    max_retries = 5
    delay = 1
    for attempt in range(max_retries):
        try:
            # FIX: json= serializes the payload and sets the Content-Type header
            # in one step; timeout= prevents the Streamlit run from hanging
            # forever on a stalled connection (the original had no timeout).
            r = requests.post(url, json=payload, timeout=60)
            r.raise_for_status()
            data = r.json()

            # Extract the structured-output text from the first candidate.
            json_str = data["candidates"][0]["content"]["parts"][0]["text"]
            return json.loads(json_str)

        except requests.exceptions.RequestException as e:
            if attempt < max_retries - 1:
                # Exponential backoff: 1s, 2s, 4s, 8s between retries.
                time.sleep(delay)
                delay *= 2
            else:
                st.error(f"API Request Failed: {e}")
                raise  # bare raise preserves the original traceback
        except (KeyError, IndexError, json.JSONDecodeError) as e:
            # Malformed/unexpected response shape — retrying will not help.
            st.error(f"Failed to parse model response or execute operation: {e}")
            raise

# --- UI ---
st.title("✨Data Analyst Agent (Code Execution Enabled)")
st.write("Upload a CSV file and ask natural language questions. The agent now generates and executes Python code to provide precise data analysis and visualizations.")

# State variable to hold the DataFrame, initialized once per session.
if 'df' not in st.session_state:
    st.session_state.df = pd.DataFrame()

uploaded = st.file_uploader("Upload CSV", type=["csv"])

if uploaded:
    # Use st.cache_data to avoid reloading the file multiple times
    @st.cache_data
    def load_data(file):
        """Parse the uploaded CSV; surface errors and fall back to an empty frame."""
        try:
            return pd.read_csv(file)
        except Exception as e:
            st.error(f"Failed to load CSV: {e}")
            return pd.DataFrame()

    st.session_state.df = load_data(uploaded)

    if not st.session_state.df.empty:
        st.subheader("Data Preview (First 5 Rows)")
        st.dataframe(st.session_state.df.head())

        question = st.text_area("Ask a complex question or request a visualization (e.g., 'Show the average of the 'Sales' column', 'Plot the distribution of 'Age'):")

        if st.button("Analyze & Execute") and question:
            df = st.session_state.df  # Local variable for code execution context

            # Summarize the dataset (columns + first 5 rows) as LLM context.
            context = f"Dataset Columns: {', '.join(df.columns.astype(str))}\n\nFirst 5 rows of data:\n{df.head(5).to_string(index=False)}"

            st.markdown("---")
            st.subheader("πŸ€– Analysis Steps")

            with st.spinner("1. Generating analysis plan and code..."):
                try:
                    # 1. Get structured response (reasoning + code) from the LLM.
                    analysis_result = chat_with_gemini(question, context)

                    reasoning = analysis_result.get('reasoning', "No reasoning provided.")
                    code = analysis_result.get('code', "")

                    st.markdown("#### πŸ’¬ Reasoning:")
                    st.markdown(reasoning)

                    st.markdown("#### 🐍 Generated Code:")
                    st.code(code, language='python')

                except Exception as e:
                    st.error(f"Step 1 Failed (LLM Interaction): {e}")
                    reasoning = ""
                    code = ""

            if code:
                with st.spinner("2. Executing code and generating output..."):
                    # SECURITY NOTE: exec() runs model-generated code with full
                    # process privileges — acceptable only for trusted/local use.
                    # Namespace the generated code may reference.
                    exec_scope = {
                        'df': df,
                        'st': st,
                        'pd': pd,
                        'np': np,
                        'plt': plt,
                        'sns': sns,
                    }

                    # Append a neutral statement to the code to prevent implicit
                    # display of the last expression value.
                    final_code = code + "\nNone"

                    try:
                        # FIX: execute in a SINGLE namespace. The original call
                        # exec(final_code, globals(), local_scope) used separate
                        # globals/locals dicts, which gives the generated code
                        # class-body scoping: any function or comprehension it
                        # defines cannot resolve names like 'df' or 'pd'.
                        exec(final_code, exec_scope)

                        st.success("Code execution complete. Results are displayed above.")

                    except Exception as e:
                        st.error(f"Step 2 Failed (Code Execution Error): The agent generated invalid code. Check the console for full traceback.")
                        st.exception(e)
                    finally:
                        # FIX: always close Matplotlib figures — even when the
                        # generated code raised — to prevent cross-run
                        # figure contamination.
                        plt.close('all')
            else:
                st.info("No code was generated, as the question was purely informational.")
    else:
        st.info("The uploaded CSV file appears to be empty.")

else:
    st.info("πŸ‘† Upload a CSV file to begin the full analysis experience.")