Spaces:
Sleeping
Sleeping
| """ | |
| Data Analyst Agent | |
| Agentic workflow: Question → Generate Code → Execute → Visualize | |
| """ | |
| import gradio as gr | |
| from huggingface_hub import InferenceClient | |
| import pandas as pd | |
| import plotly.express as px | |
| import plotly.graph_objects as go | |
| import io | |
| import sys | |
| import os | |
| from contextlib import redirect_stdout, redirect_stderr | |
| import traceback | |
| sys.path.append(os.path.join(os.path.dirname(__file__), '..')) | |
| from shared.components import create_method_panel, create_premium_hero | |
# Module-level inference client, created once at import time and used by
# generate_analysis_code() to request chat completions from the hosted
# Llama 3.3 70B Instruct model.
client = InferenceClient(model="meta-llama/Llama-3.3-70B-Instruct")
def safe_execute_code(code: str, df: pd.DataFrame, timeout: int = 5) -> tuple:
    """Execute generated analysis code against ``df`` and collect its outputs.

    The code runs via ``exec`` in a fresh namespace exposing ``pd``, ``df``,
    ``px`` and ``go``. Anything it prints is captured instead of being
    written to the real stdout/stderr.

    NOTE(security): ``exec`` is not a sandbox. The code is produced by an LLM
    prompted with user input and runs with the full privileges of this
    process; isolate it (subprocess/container) before exposing this to
    untrusted users.

    Args:
        code: Python source to execute.
        df: DataFrame made available to the code under the name ``df``.
        timeout: Accepted for interface compatibility only. It is currently
            NOT enforced, so long-running code blocks the caller.

    Returns:
        ``(success, result, fig, stdout_text, stderr_text)``. On success,
        ``result`` and ``fig`` are whatever the code bound to those names
        (``None`` if unset). On failure both are ``None``, ``stdout_text``
        holds everything printed before the error, and ``stderr_text`` holds
        the captured stderr followed by the formatted traceback.
    """
    # Created outside the try block so partial output survives a failure.
    stdout_capture = io.StringIO()
    stderr_capture = io.StringIO()

    try:
        namespace = {"pd": pd, "df": df, "px": px, "go": go}
        with redirect_stdout(stdout_capture), redirect_stderr(stderr_capture):
            exec(code, namespace)
    except (Exception, SystemExit):
        # SystemExit is caught as well: generated code calling sys.exit()
        # or exit() must be reported as a failed run, not tear down the
        # worker that is serving the request.
        return (
            False,
            None,
            None,
            stdout_capture.getvalue(),
            stderr_capture.getvalue() + traceback.format_exc(),
        )

    return (
        True,
        namespace.get("result"),
        namespace.get("fig"),
        stdout_capture.getvalue(),
        stderr_capture.getvalue(),
    )
# Offline fallback rules: (keywords, canned pandas snippet). Checked in order;
# the first rule with any keyword contained in the lowercased question wins.
_OFFLINE_SNIPPETS = (
    (("correlat",), "result = df.select_dtypes(include='number').corr()"),
    (("top",), "result = df.head(10)"),
    (
        ("average", "mean"),
        "result = df.select_dtypes(include='number').mean().sort_values(ascending=False)",
    ),
    (
        ("distribution", "histogram"),
        "numeric_cols = df.select_dtypes(include='number').columns\n"
        "result = df[numeric_cols].describe()\n"
        "fig = px.histogram(df, x=numeric_cols[0]) if len(numeric_cols) else None",
    ),
    (("missing", "null"), "result = df.isna().sum().sort_values(ascending=False)"),
)
_OFFLINE_DEFAULT = "result = df.describe(include='all').transpose()"


def _extract_code(text: str) -> str:
    """Return the body of the first fenced code block in *text*.

    A block fenced as ```python is preferred over a generic ``` fence. If
    *text* contains no fence at all it is returned unchanged.
    """
    for fence in ("```python", "```"):
        if fence in text:
            body = text.split(fence, 1)[1].split("```", 1)[0]
            break
    else:
        return text

    # Whatever remains on the opening-fence line is a language tag ("py",
    # "Python", the "3" of "python3", ...), not code. Leaving it in place
    # makes the snippet fail when executed, so drop it.
    first_line, newline, rest = body.partition("\n")
    tag = first_line.strip()
    if newline and (not tag or tag.isalnum()):
        body = rest
    return body.strip()


def generate_analysis_code(question: str, df_info: str) -> str:
    """Generate pandas code that answers *question* about the dataset.

    When the ``HF_TOKEN`` environment variable is unset, no model is called:
    a canned snippet is chosen by keyword matching on the question. Otherwise
    the module-level ``client`` is asked for code via a streamed chat
    completion and the code is extracted from any Markdown fence in the reply.

    Args:
        question: The user's natural-language question.
        df_info: Text description of the dataset, embedded in the prompt.

    Returns:
        Python source that is expected to bind ``result`` (and optionally
        ``fig``) when executed with ``df`` in scope.
    """
    if not os.getenv("HF_TOKEN"):
        q = question.lower()
        for keywords, snippet in _OFFLINE_SNIPPETS:
            if any(keyword in q for keyword in keywords):
                return snippet
        return _OFFLINE_DEFAULT

    prompt = f"""You are a data analyst. Generate Python pandas code to answer this question.
Dataset Info:
{df_info}
Question: {question}
Requirements:
1. Use the dataframe 'df' (already loaded)
2. Store the final answer in a variable called 'result'
3. If creating a visualization, store it in 'fig' using plotly express (px) or plotly graph objects (go)
4. Keep code simple and clean
5. Add comments explaining key steps
Generate ONLY the Python code, no explanations:"""

    chunks = []
    for message in client.chat_completion(
        messages=[{"role": "user", "content": prompt}],
        max_tokens=800,
        stream=True,
    ):
        # Some stream chunks may carry no choices; skip them instead of
        # failing on choices[0].
        if not message.choices:
            continue
        chunks.append(message.choices[0].delta.content or "")

    return _extract_code("".join(chunks))
def _render_result(result) -> str:
    """Render an analysis result as a Markdown "Result" section."""
    if isinstance(result, pd.DataFrame):
        try:
            return f"### Result\n{result.to_markdown()}\n\n"
        except ImportError:
            # DataFrame.to_markdown() needs the optional 'tabulate' package;
            # fall back to a plain-text table instead of failing the request.
            return f"### Result\n```\n{result.to_string()}\n```\n\n"
    return f"### Result\n```\n{result}\n```\n\n"


def analyze_data(csv_file, question, progress=gr.Progress()):
    """Run the agent workflow: load CSV, generate code, execute, format.

    Args:
        csv_file: The uploaded file. Either an object exposing the file path
            as ``.name`` or a plain path string; ``None`` if nothing was
            uploaded.
        question: The user's question about the data.
        progress: Gradio progress tracker, updated at each step.

    Returns:
        ``(markdown_report, generated_code, figure)``. ``generated_code`` is
        an empty string and ``figure`` is ``None`` when validation fails or
        an unexpected error occurs; ``figure`` is also ``None`` when the
        generated code fails to execute.
    """
    if csv_file is None:
        return "Please upload a CSV file.", "", None
    if not question or not question.strip():
        return "Please enter a question.", "", None

    try:
        # Step 1: Load data
        progress(0.2, desc="Loading data...")
        csv_path = getattr(csv_file, "name", csv_file)
        df = pd.read_csv(csv_path)

        # Describe the dataset (schema, sample rows, summary statistics) so
        # the code generator knows what it is working with.
        buffer = io.StringIO()
        df.info(buf=buffer)
        df_info = buffer.getvalue()
        df_info += f"\n\nFirst few rows:\n{df.head().to_string()}\n"
        df_info += f"\nBasic statistics:\n{df.describe().to_string()}"

        # Step 2: Generate code
        progress(0.4, desc="Generating analysis code...")
        code = generate_analysis_code(question, df_info)

        # Step 3: Execute code
        progress(0.7, desc="Executing code...")
        success, result, fig, stdout, stderr = safe_execute_code(code, df)

        # Step 4: Format results
        progress(0.9, desc="Formatting results...")
        if not success:
            output = (
                f"## ❌ Execution Error\n\n```\n{stderr}\n```\n\n"
                f"### Generated Code:\n```python\n{code}\n```"
            )
            return output, code, None

        output = f"## ✅ Analysis Complete\n\n### Question\n{question}\n\n"
        if result is not None:
            output += _render_result(result)
        if stdout:
            output += f"### Output\n```\n{stdout}\n```\n\n"

        progress(1.0, desc="Complete!")
        return output, code, fig
    except Exception:
        # Top-level UI boundary: show the traceback to the user rather than
        # letting the event handler crash.
        return f"## ❌ Error\n\n```\n{traceback.format_exc()}\n```", "", None
# Gradio interface: hero + method panel, an input column (CSV upload,
# question, examples), a results row, and a row with the generated code next
# to the visualization. The Analyze button is wired to analyze_data().
with gr.Blocks(theme=gr.themes.Soft(), title="Data Analyst Agent") as demo:
    create_premium_hero(
        "Data Analyst Agent",
        "Ask questions about a CSV and watch the agent generate pandas, execute safely, and return a visual analysis.",
        "📊",
        badge="Agentic Analytics",
        highlights=["Code generation", "Sandboxed pandas", "Visual output"],
    )
    create_method_panel({
        "Workflow": "Question → schema inspection → code synthesis → constrained execution → chart/report.",
        "What it proves": "You can build agent workflows with boundaries, observability, and user-facing results.",
        "HF capability": "Pairs hosted instruction models with classic Python data tooling inside a Space.",
    })

    with gr.Row():
        with gr.Column():
            csv_input = gr.File(
                label="Upload CSV",
                file_types=[".csv"]
            )
            question_input = gr.Textbox(
                label="Ask a Question",
                placeholder="e.g., What's the average sales by region?",
                lines=3
            )
            analyze_btn = gr.Button("🔬 Analyze", variant="primary", size="lg")
            gr.Examples(
                examples=[
                    ["What are the top 5 values?"],
                    ["Calculate average by category"],
                    ["Show distribution with a histogram"],
                    ["Find correlations between numeric columns"],
                ],
                inputs=question_input
            )

    with gr.Row():
        with gr.Column():
            output = gr.Markdown(label="Results")

    with gr.Row():
        with gr.Column():
            code_output = gr.Code(
                label="Generated Code",
                language="python"
            )
        with gr.Column():
            plot_output = gr.Plot(label="Visualization")

    # Outputs map positionally onto analyze_data's (report, code, figure).
    analyze_btn.click(
        fn=analyze_data,
        inputs=[csv_input, question_input],
        outputs=[output, code_output, plot_output]
    )


if __name__ == "__main__":
    demo.launch()