Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -3,56 +3,166 @@ import streamlit as st
|
|
| 3 |
import pandas as pd
|
| 4 |
import numpy as np
|
| 5 |
import requests
|
|
|
|
|
|
|
| 6 |
|
| 7 |
# --- CONFIG ---
|
|
|
|
| 8 |
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
|
|
|
|
| 9 |
if not GEMINI_API_KEY:
|
| 10 |
st.error("β Missing Gemini API key. Add it as a secret: GEMINI_API_KEY")
|
| 11 |
st.stop()
|
| 12 |
|
|
|
|
| 13 |
GEMINI_BASE = "https://generativelanguage.googleapis.com/v1beta"
|
| 14 |
-
CHAT_MODEL = "
|
| 15 |
-
EMBED_MODEL = "models/embedding-001"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
|
| 17 |
# --- Helper Functions ---
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
return r.json()["embedding"]["value"]
|
| 24 |
-
|
| 25 |
-
def chat_with_gemini(prompt, context=""):
|
| 26 |
url = f"{GEMINI_BASE}/{CHAT_MODEL}:generateContent?key={GEMINI_API_KEY}"
|
|
|
|
|
|
|
|
|
|
|
|
|
| 27 |
payload = {
|
| 28 |
"contents": [
|
| 29 |
-
{"parts": [{"text":
|
| 30 |
-
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 31 |
}
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 36 |
|
| 37 |
# --- UI ---
|
| 38 |
-
st.title("
|
| 39 |
-
st.write("Upload a CSV file and ask natural language questions
|
|
|
|
|
|
|
|
|
|
|
|
|
| 40 |
|
| 41 |
uploaded = st.file_uploader("Upload CSV", type=["csv"])
|
| 42 |
|
| 43 |
if uploaded:
|
| 44 |
-
|
| 45 |
-
st.
|
| 46 |
-
|
| 47 |
-
question = st.text_input("Ask a question about your data:")
|
| 48 |
-
if st.button("Analyze") and question:
|
| 49 |
-
# Summarize dataset for context
|
| 50 |
-
summary = f"Columns: {', '.join(df.columns)}. Example rows:\n{df.head(3).to_string(index=False)}"
|
| 51 |
try:
|
| 52 |
-
|
| 53 |
-
st.markdown("### π¬ Gemini Answer:")
|
| 54 |
-
st.write(response)
|
| 55 |
except Exception as e:
|
| 56 |
-
st.error(f"
|
| 57 |
-
|
| 58 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import pandas as pd
import numpy as np
import requests
import json
import time  # Ensure time is imported for backoff

# --- CONFIG ---
# Note: GEMINI_API_KEY is retrieved from environment variables/secrets.
# (os and streamlit are imported above this hunk — not visible in this view.)
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")

# Fail fast with a visible error if the key is missing; st.stop() halts the script run.
if not GEMINI_API_KEY:
    st.error("β Missing Gemini API key. Add it as a secret: GEMINI_API_KEY")
    st.stop()

# Define API endpoints and models
GEMINI_BASE = "https://generativelanguage.googleapis.com/v1beta"
CHAT_MODEL = "gemini-2.5-flash-preview-09-2025"  # Using the correct model for structured output
EMBED_MODEL = "models/embedding-001"  # NOTE(review): defined but not used in this view — confirm against rest of file

# Define the JSON schema for structured output.
# Forces the model to return {"reasoning": str, "code": str} so the UI can
# render the explanation and execute the generated code separately.
ANALYSIS_SCHEMA = {
    "type": "OBJECT",
    "properties": {
        "reasoning": {
            "type": "STRING",
            "description": "A detailed natural language explanation of the analysis, including key findings and context."
        },
        "code": {
            "type": "STRING",
            "description": "The complete, runnable Python code using pandas (df) and streamlit (st). Use st.pyplot() for plots, and st.dataframe() for resulting DataFrames. If no code is needed, this should be an empty string."
        }
    }
}

# System prompt sent with every request; constrains the model to the schema above
# and to operating on the pre-loaded 'df' variable.
SYSTEM_INSTRUCTION = (
    "You are a world-class Data Analyst Agent. Your task is to analyze the provided DataFrame ('df') "
    "based on the user's question. You MUST respond with a single JSON object conforming to the provided schema. "
    "1. **Reasoning:** Explain your plan, the steps taken, and the insights derived from the data. Format this in Markdown. "
    "2. **Code:** If the question requires calculation, aggregation, or visualization, you MUST generate Python code to execute against the 'df' DataFrame. "
    "   - The DataFrame is already loaded as a variable named 'df'. Do NOT redefine it. "
    "   - Use Streamlit functions for output: `st.dataframe(...)` for results, `st.bar_chart()`, `st.line_chart()`, or `st.pyplot()` for plots. "
    "   - Use `import matplotlib.pyplot as plt` if creating custom plots. "
    "   - Ensure the code is self-contained and ready to execute."
)
# --- Helper Functions ---

# Function to chat with the Gemini API and enforce structured JSON output
def chat_with_gemini(prompt, context):
    """Sends a prompt and data context to the Gemini model for structured analysis (reasoning + code).

    Args:
        prompt: The user's natural-language question about the data.
        context: Text summary of the DataFrame (columns + sample rows) to ground the model.

    Returns:
        dict parsed from the model's JSON reply; expected to match ANALYSIS_SCHEMA
        (keys: 'reasoning', 'code').

    Raises:
        requests.exceptions.RequestException: if all retry attempts fail.
        Exception: if the response body cannot be parsed (missing candidates, bad JSON).
    """
    url = f"{GEMINI_BASE}/{CHAT_MODEL}:generateContent?key={GEMINI_API_KEY}"

    # Construct the full prompt including the data context
    full_prompt = f"Data Context (DataFrame Head and Columns):\n{context}\n\nUser Question: {prompt}"

    payload = {
        "contents": [
            {"parts": [{"text": full_prompt}]}
        ],
        "systemInstruction": {"parts": [{"text": SYSTEM_INSTRUCTION}]},
        "generationConfig": {
            "responseMimeType": "application/json",
            "responseSchema": ANALYSIS_SCHEMA
        }
    }

    max_retries = 5
    delay = 1  # seconds; doubled after each failed attempt (exponential backoff)
    for attempt in range(max_retries):
        try:
            # FIX: json= serializes the payload and sets Content-Type automatically;
            # FIX: timeout= prevents the Streamlit script from hanging indefinitely
            # on a stalled connection (the original post had no timeout).
            r = requests.post(url, json=payload, timeout=60)
            r.raise_for_status()
            data = r.json()

            # The structured JSON output is a string inside the first 'text' part.
            json_str = data["candidates"][0]["content"]["parts"][0]["text"]
            return json.loads(json_str)

        except requests.exceptions.RequestException as e:
            if attempt < max_retries - 1:
                time.sleep(delay)
                delay *= 2
            else:
                st.error(f"API Request Failed: {e}")
                raise  # FIX: bare raise preserves the original traceback (was: raise e)
        except Exception as e:
            # Parsing failures (missing keys, invalid JSON) are not retried.
            st.error(f"Failed to parse model response or execute operation: {e}")
            raise
|
| 93 |
# --- UI ---
|
| 94 |
+
st.title("β¨ Perfect Data Analyst Agent (Code Execution Enabled)")
|
| 95 |
+
st.write("Upload a CSV file and ask natural language questions. The agent now generates and executes Python code to provide precise data analysis and visualizations.")
|
| 96 |
+
|
| 97 |
+
# State variable to hold the DataFrame, initialized once
|
| 98 |
+
if 'df' not in st.session_state:
|
| 99 |
+
st.session_state.df = pd.DataFrame()
|
| 100 |
|
| 101 |
uploaded = st.file_uploader("Upload CSV", type=["csv"])
|
| 102 |
|
| 103 |
if uploaded:
|
| 104 |
+
# Use st.cache_data to avoid reloading the file multiple times
|
| 105 |
+
@st.cache_data
|
| 106 |
+
def load_data(file):
|
|
|
|
|
|
|
|
|
|
|
|
|
| 107 |
try:
|
| 108 |
+
return pd.read_csv(file)
|
|
|
|
|
|
|
| 109 |
except Exception as e:
|
| 110 |
+
st.error(f"Failed to load CSV: {e}")
|
| 111 |
+
return pd.DataFrame()
|
| 112 |
+
|
| 113 |
+
st.session_state.df = load_data(uploaded)
|
| 114 |
+
|
| 115 |
+
if not st.session_state.df.empty:
|
| 116 |
+
st.subheader("Data Preview (First 5 Rows)")
|
| 117 |
+
st.dataframe(st.session_state.df.head())
|
| 118 |
+
|
| 119 |
+
question = st.text_area("Ask a complex question or request a visualization (e.g., 'Show the average of the 'Sales' column', 'Plot the distribution of 'Age'):")
|
| 120 |
+
|
| 121 |
+
if st.button("Analyze & Execute") and question:
|
| 122 |
+
df = st.session_state.df # Local variable for code execution context
|
| 123 |
+
|
| 124 |
+
# Summarize dataset for context sent to the LLM
|
| 125 |
+
context = f"Dataset Columns: {', '.join(df.columns.astype(str))}\n\nFirst 5 rows of data:\n{df.head(5).to_markdown(index=False)}"
|
| 126 |
+
|
| 127 |
+
st.markdown("---")
|
| 128 |
+
st.subheader("π€ Analysis Steps")
|
| 129 |
+
|
| 130 |
+
with st.spinner("1. Generating analysis plan and code..."):
|
| 131 |
+
try:
|
| 132 |
+
# 1. Get structured response from LLM
|
| 133 |
+
analysis_result = chat_with_gemini(question, context)
|
| 134 |
+
|
| 135 |
+
reasoning = analysis_result.get('reasoning', "No reasoning provided.")
|
| 136 |
+
code = analysis_result.get('code', "")
|
| 137 |
+
|
| 138 |
+
st.markdown("#### π¬ Reasoning:")
|
| 139 |
+
st.markdown(reasoning)
|
| 140 |
+
|
| 141 |
+
st.markdown("#### π Generated Code:")
|
| 142 |
+
st.code(code, language='python')
|
| 143 |
+
|
| 144 |
+
except Exception as e:
|
| 145 |
+
st.error(f"Step 1 Failed (LLM Interaction): {e}")
|
| 146 |
+
reasoning = ""
|
| 147 |
+
code = ""
|
| 148 |
+
|
| 149 |
+
if code:
|
| 150 |
+
with st.spinner("2. Executing code and generating output..."):
|
| 151 |
+
try:
|
| 152 |
+
# 2. Execute the generated Python code safely
|
| 153 |
+
|
| 154 |
+
# IMPORTANT: Create a local scope with necessary variables (df, st)
|
| 155 |
+
local_scope = {
|
| 156 |
+
'df': df,
|
| 157 |
+
'st': st,
|
| 158 |
+
'pd': pd,
|
| 159 |
+
'np': np,
|
| 160 |
+
}
|
| 161 |
+
# Executing the code within the local scope
|
| 162 |
+
exec(code, globals(), local_scope)
|
| 163 |
+
|
| 164 |
+
st.success("Code execution complete. Results are displayed above.")
|
| 165 |
+
|
| 166 |
+
except Exception as e:
|
| 167 |
+
st.error(f"Step 2 Failed (Code Execution Error): The agent generated invalid code.")
|
| 168 |
+
st.exception(e)
|