KarthikMuraliM commited on
Commit
c0897c1
·
1 Parent(s): 553f95b

Initial commit from existing project

Browse files
Files changed (7) hide show
  1. .gitignore +130 -0
  2. DockerFile +46 -0
  3. LICENSE +21 -0
  4. README copy.md +1 -0
  5. main.py +229 -0
  6. requirements.txt +30 -0
  7. tools.py +758 -0
.gitignore ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+
28
+ # PyInstaller
29
+ # Usually these files are written by a python script from a template
30
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
31
+ *.manifest
32
+ *.spec
33
+
34
+ # Installer logs
35
+ pip-log.txt
36
+ pip-delete-this-directory.txt
37
+
38
+ # Unit test / coverage reports
39
+ htmlcov/
40
+ .tox/
41
+ .nox/
42
+ .coverage
43
+ .coverage.*
44
+ .cache
45
+ nosetests.xml
46
+ coverage.xml
47
+ *.cover
48
+ *.py,cover
49
+ *.hypothesis/
50
+ .pytest_cache/
51
+
52
+ # Translations
53
+ *.mo
54
+ *.pot
55
+
56
+ # Django stuff:
57
+ *.log
58
+ local_settings.py
59
+ db.sqlite3
60
+ db.sqlite3-journal
61
+
62
+ # Flask stuff:
63
+ instance/
64
+ .webassets-cache
65
+
66
+ # Scrapy stuff:
67
+ .scrapy
68
+
69
+ # Sphinx documentation
70
+ docs/_build/
71
+
72
+ # PyBuilder
73
+ .pybuilder/
74
+ target/
75
+
76
+ # Jupyter Notebook
77
+ .ipynb_checkpoints
78
+
79
+ # IPython
80
+ profile_default/
81
+ ipython_config.py
82
+
83
+ # pyenv
84
+ .python-version
85
+
86
+ # pipenv
87
+ Pipfile.lock
88
+
89
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow
90
+ __pypackages__/
91
+
92
+ # Celery stuff
93
+ celerybeat-schedule
94
+ celerybeat.pid
95
+
96
+ # SageMath parsed files
97
+ *.sage.py
98
+
99
+ # Environments
100
+ .env
101
+ .venv
102
+ env/
103
+ venv/
104
+ ENV/
105
+ env.bak/
106
+ venv.bak/
107
+
108
+ # Spyder project settings
109
+ .spyderproject
110
+ .spyproject
111
+
112
+ # Rope project settings
113
+ .ropeproject
114
+
115
+ # mkdocs documentation
116
+ /site
117
+
118
+ # mypy
119
+ .mypy_cache/
120
+ .dmypy.json
121
+ dmypy.json
122
+
123
+ # Pyre type checker
124
+ .pyre/
125
+
126
+ # pytype static type analyzer
127
+ .pytype/
128
+
129
+ # Cython debug symbols
130
+ cython_debug/
DockerFile ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Dockerfile

# Start from a standard Python 3.11 base image
FROM python:3.11-slim

# Set the working directory inside the container
WORKDIR /app

# 1. Install System-Level Dependencies
# git (good practice), Java for the tabula-py library, plus curl and unzip.
RUN apt-get update && apt-get install -y --no-install-recommends \
    curl \
    unzip \
    git \
    default-jre \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

# 2. Install 'uv', the fast Python package manager
RUN curl -LsSf https://astral.sh/uv/install.sh | sh
# BUGFIX: current uv installers place the binary in ~/.local/bin (older
# releases used ~/.cargo/bin) — put both on PATH so either layout works.
ENV PATH="/root/.local/bin:/root/.cargo/bin:${PATH}"

# 3. Install Python Dependencies using 'uv'
# Copy requirements file first to leverage Docker layer caching
COPY requirements.txt .
# BUGFIX: 'uv pip install' refuses to run without an active virtualenv
# unless --system is passed; install into the image's system interpreter.
RUN uv pip install --system --no-cache-dir -r requirements.txt

# 4. Install Playwright Browsers
# BUGFIX: --with-deps also installs the system libraries the browsers need,
# which are missing from the slim base image.
RUN playwright install --with-deps

# 5. Copy the application code (main.py, tools.py, ...) into /app
COPY . .

# 6. The API listens on port 8000
EXPOSE 8000

# 7. Startup command. --host 0.0.0.0 is ESSENTIAL inside a container.
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]
LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2025 KarthikMurali-M
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
README copy.md ADDED
@@ -0,0 +1 @@
 
 
1
+ # Data-Analyst-Agent-API
main.py ADDED
@@ -0,0 +1,229 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# main.py — application setup: imports, environment, LLM client, tool registry.
import asyncio
import json
import os
import re
import tempfile
import time
from pathlib import Path
from typing import Dict, List

import openai
from dotenv import load_dotenv
from fastapi import FastAPI, File, HTTPException, UploadFile, status

from tools import (
    TOOL_DEFINITIONS,
    analyze_image_content,
    calculate_correlation,
    create_pivot_table,
    fetch_url,
    geocode_address,
    get_bbc_weather,
    get_dataframe_info,
    get_sentiment,
    parse_html,
    python_interpreter,
    run_sql_query,
    scrape_dynamic_site,
    scrape_pdf_tables,
    scrape_wikipedia_summary,
)

# Directory where files will be saved
UPLOAD_DIR = Path("uploaded_files")
UPLOAD_DIR.mkdir(parents=True, exist_ok=True)

# --- Load Environment Variables ---
# Loaded from a local .env file so AIPIPE_TOKEN can be supplied via file.
load_dotenv()

if "AIPIPE_TOKEN" not in os.environ:
    raise RuntimeError("The AIPIPE_TOKEN environment variable is not set. Please set it to your token from aipipe.org.")

# Configure the OpenAI client to point to the AI Pipe proxy.
# The AI Pipe token plays the role of the OpenAI API key.
client = openai.OpenAI(
    base_url="https://aipipe.org/openrouter/v1",
    api_key=os.getenv("AIPIPE_TOKEN"),
)

# Registry mapping the tool names the planner may emit to their implementations.
AVAILABLE_TOOLS: Dict[str, callable] = {
    "fetch_url": fetch_url,
    "python_interpreter": python_interpreter,
    "get_dataframe_info": get_dataframe_info,
    "calculate_correlation": calculate_correlation,
    "create_pivot_table": create_pivot_table,
    "run_sql_query": run_sql_query,
    "get_sentiment": get_sentiment,
    "scrape_wikipedia_summary": scrape_wikipedia_summary,
    "scrape_pdf_tables": scrape_pdf_tables,
    "analyze_image_content": analyze_image_content,
    "geocode_address": geocode_address,
    "scrape_dynamic_site": scrape_dynamic_site,
    "parse_html": parse_html,
    "get_bbc_weather": get_bbc_weather,
}
47
+
48
+
49
+ def is_output_valid(result: str | None) -> bool:
50
+ """A simple validator to check if the agent's output is complete."""
51
+ if result is None or result.strip() == "":
52
+ return False
53
+ try:
54
+ data = json.loads(result)
55
+ # Check for common failure indicators like null values or "N/A"
56
+ if isinstance(data, list) and any(x is None or "N/A" in str(x) for x in data):
57
+ return False
58
+ if isinstance(data, dict) and any(v is None for v in data.values()):
59
+ return False
60
+ except (json.JSONDecodeError, TypeError):
61
+ return False # Not valid JSON
62
+ return True
63
+
64
+
65
# FastAPI application instance; metadata appears in the auto-generated docs.
app = FastAPI(
    title="Data Analyst Agent API",
    description="An API that uses LLMs to source, prepare, analyze, and visualize data. ;By 24f2001293@ds.study.iitm.ac.in",
)
69
+
70
+
71
@app.post("/api/")
async def process_analysis_request(files: List[UploadFile] = File(...)):
    """Run the plan -> execute -> validate agent loop over the uploaded files.

    Expects one .txt file with the questions (preferably named '*question*')
    plus any number of data attachments. Retries up to 3 times, feeding the
    previous plan and error back to the planner; on total failure returns an
    empty {} or [] depending on the requested output format.
    """
    max_retries = 3
    last_error = ""
    last_plan = {}

    with tempfile.TemporaryDirectory() as work_dir:
        # --- 1. FILE HANDLING (Using the Robust In-Memory Method) ---
        work_path = Path(work_dir)
        if not files:
            raise HTTPException(status.HTTP_400_BAD_REQUEST, "No files uploaded.")

        file_contents = {f.filename: await f.read() for f in files}

        # Identify the questions file: prefer a .txt whose name contains
        # "question"; otherwise fall back to the first .txt seen.
        questions_file_name, attached_file_names = None, []
        first_txt_file_name = None
        q_pattern = re.compile(r'question', re.IGNORECASE)

        for filename in file_contents.keys():
            is_txt = filename.lower().endswith(".txt")
            if is_txt and first_txt_file_name is None:
                first_txt_file_name = filename
            if is_txt and q_pattern.search(filename):
                if questions_file_name is None:
                    questions_file_name = filename
                else:
                    attached_file_names.append(filename)
            else:
                attached_file_names.append(filename)

        if questions_file_name is None:
            questions_file_name = first_txt_file_name
            if questions_file_name in attached_file_names:
                attached_file_names.remove(questions_file_name)

        if not questions_file_name:
            raise HTTPException(status.HTTP_400_BAD_REQUEST, "No .txt question file found.")

        task_content = file_contents[questions_file_name].decode("utf-8")

        # Materialize every upload into the temp working dir so tools can
        # reference files by plain filename.
        for filename, content in file_contents.items():
            file_path = work_path / filename
            with open(file_path, "wb") as f:
                f.write(content)
        # ----------------------------------------------------
        # BUGFIX: time.sleep() would block the event loop inside this async
        # handler; yield to the loop instead.
        await asyncio.sleep(0.1)

        # --- SELF-CORRECTION LOOP ---
        for i in range(max_retries):
            print(f"\n--- AGENT ATTEMPT #{i + 1} ---")

            # --- 2. PLANNING ---
            # BUGFIX: removed the redundant inline re-definition of
            # AVAILABLE_TOOLS; the module-level registry (same entries)
            # is used directly.
            planner_system_prompt = f"""
You are an expert-level data analysis planner. Your purpose is to convert a user's request into a step-by-step JSON execution plan.

You have been provided with the following available tools:
{", ".join(AVAILABLE_TOOLS.keys())}

You must decide on the best strategy to fulfill the request. You have two strategies available:

**Strategy 1: Use a Specialized Tool.**
If the user's request can be answered directly and completely by a single call to one of the specialized tools (e.g., `get_bbc_weather`, `geocode_address`, `get_sentiment`), you MUST generate a simple, one-step plan that calls that tool. This is your preferred strategy for simple, direct requests. USE THE SAME NAMES GIVEN TO YOU TO CREATE THE TOOL CALLS.

**Strategy 2: Generate a Single Python Script.**
If the user's request is complex, requires multiple steps, data manipulation, or cannot be handled by a single specialized tool, you MUST generate a plan containing a SINGLE step that uses the `python_interpreter`. This single step must contain a complete, self-contained Python script that performs all the necessary actions and prints the final JSON output.

CRITICAL RULES:
1. **CHOOSE A STRATEGY:** First, analyze the request. Is it a simple task for a specialized tool, or a complex one requiring a full script?
2. **TOOL NAMES:** You MUST use the exact tool names from the provided list.
3. **DATA CLEANING (IMPORTANT):** When you load a CSV file using pandas, the column names might have leading/trailing whitespace. Your first step after loading the data MUST be to clean the column names. A good method is: `df.columns = df.columns.str.strip()`.
4. **FINAL OUTPUT (CRITICAL):** You MUST read the user's request very carefully to determine the exact final output format.
- If the user asks for a **JSON object with specific keys**, your script's final print statement MUST produce a JSON object with EXACTLY those keys and data types.
- If the user asks for a **JSON array**, your script's final print statement MUST produce a JSON array with the raw values in the correct order.
5. **NO PLACEHOLDERS:** You MUST perform the actual calculations and data analysis required. Do not use placeholder or example values in your final output. The results must be derived from the provided data sources.
6. Your entire output MUST be ONLY a valid JSON object representing the execution plan. The plan should follow this schema: {{"plan": {{"steps": [{{...}}]}}}}
"""

            # On retries (i > 0), add the context of the last failure
            if i > 0:
                user_prompt = f"The previous attempt failed.\nPREVIOUS PLAN:\n{json.dumps(last_plan, indent=2)}\n\nPREVIOUS ERROR/OUTPUT:\n{last_error}\n\nPlease analyze the error and generate a new, corrected plan to fulfill the original request:\n{task_content}"
            else:
                user_prompt = f"--- USER REQUEST ---\n{task_content}\n\n--- AVAILABLE FILES ---\n{attached_file_names}"

            print("--- Calling Planner LLM to create execution plan ---")
            planner_messages = [
                {"role": "system", "content": planner_system_prompt},
                {"role": "user", "content": user_prompt},
            ]

            try:
                response = client.chat.completions.create(
                    model="openai/gpt-5-nano",
                    messages=planner_messages,
                    response_format={"type": "json_object"},
                )
                plan_str = response.choices[0].message.content
                plan = json.loads(plan_str)
                print("\n\n--- 🕵️ DECODING: PLAN RECEIVED 🕵️ ---")
                print(json.dumps(plan, indent=2))
                print("-----------------------------------------\n")

                last_plan = plan
                print("--- Plan received from Planner ---")
            except Exception as e:
                last_error = f"Planner failed to generate a valid JSON plan: {e}"
                if i < max_retries - 1:
                    continue  # Go to the next retry attempt
                else:
                    break  # Exit loop if retries are exhausted

            # --- 3. EXECUTION ---
            print("--- Starting Worker execution ---")
            final_result = None
            try:
                plan_steps = plan.get("plan", {}).get("steps", [])
                if not plan_steps:
                    raise ValueError("The generated plan contains no steps.")

                for step_data in plan_steps:
                    # Accept the several key spellings the planner may use
                    # for the tool name.
                    tool_name = step_data.get("tool_name", step_data.get("tool", step_data.get("action", step_data.get("name"))))
                    if not tool_name:
                        raise ValueError("Plan step is missing a 'tool' or 'action' key.")

                    tool_function = AVAILABLE_TOOLS.get(tool_name)
                    if not tool_function:
                        raise ValueError(f"Plan requested an unknown tool: '{tool_name}'")

                    # Everything that is not plan metadata is a tool argument.
                    known_keys = ["step", "id", "tool", "tool_name", "action", "description", "notes", "output"]
                    arguments = {k: v for k, v in step_data.items() if k not in known_keys}
                    if "script" in arguments and "code" not in arguments:
                        arguments["code"] = arguments.pop("script")

                    # Add special context arguments the planner cannot know.
                    if tool_name in ["python_interpreter", "get_dataframe_info", "calculate_correlation", "create_pivot_table", "scrape_pdf_tables", "analyze_image_content", "scrape_dynamic_site", "parse_html"]:
                        arguments["work_dir"] = work_dir
                    if tool_name in ["get_sentiment", "analyze_image_content"]:
                        arguments["client"] = client

                    # Execute the tool (await coroutine tools).
                    if asyncio.iscoroutinefunction(tool_function):
                        output = await tool_function(**arguments)
                    else:
                        output = tool_function(**arguments)
                    final_result = output

                # --- 4. VALIDATION ---
                if is_output_valid(final_result):
                    print("--- Output is valid. Task complete. ---")
                    print("\n\n--- ✅ DECODING: FINAL VALID OUTPUT ✅ ---")
                    print(final_result)
                    print("------------------------------------------\n")

                    return json.loads(final_result)
                else:
                    print("--- Output is invalid. Triggering self-correction. ---")
                    last_error = f"The script executed but produced an invalid result: {final_result}"

            except Exception as e:
                print("--- Execution failed. Triggering self-correction. ---")
                last_error = f"The worker failed to execute the plan. Error: {repr(e)}"

        # If all retries fail, report and fall through to the empty default.
        print(f"--- AGENT FAILED: All {max_retries} attempts exhausted. Returning empty JSON. ---")
        print(f"Last known error was: {last_error}")

        # We need to figure out the expected format (list or dict) from the
        # question. A simple heuristic: "JSON object" -> {}, "JSON array" -> [].
        if "JSON object" in task_content:
            return {}
        elif "JSON array" in task_content:
            return []
        else:
            # A safe default if the format is not specified
            return {}
229
+
requirements.txt ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
fastapi
openai
requests
uvicorn
python-dotenv
python-multipart
pandas
matplotlib
numpy
scikit-learn
pyarrow
tabula-py
Pillow
beautifulsoup4
lxml
httpx
geopy
networkx
duckdb-engine
sqlalchemy
mysql-connector-python
wikipedia
playwright
boto3
tools.py ADDED
@@ -0,0 +1,758 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # tools.py
2
+ import subprocess
3
+ import requests
4
+ import base64
5
+ from pathlib import Path
6
+ import sys
7
+ import json
8
+ import pandas as pd
9
+ from typing import List,Dict
10
+
11
+ import io
12
+ from sqlalchemy import create_engine, text
13
+ import openai
14
+ import wikipedia
15
+ import numpy as np
16
+ import tabula
17
+ from PIL import Image
18
+ import base64
19
+ from geopy.geocoders import Nominatim
20
+ from playwright.async_api import async_playwright
21
+ import asyncio
22
+ from bs4 import BeautifulSoup
23
+ import requests
24
+
25
+
26
+
27
+
28
# OpenAI function-calling schemas for every tool the planner/worker may use.
# Each entry follows the Chat Completions "tools" format: a function name,
# a natural-language description for the LLM, and a JSON-Schema parameter spec.
TOOL_DEFINITIONS = [
    # Generic HTTP fetch.
    {
        "type": "function",
        "function": {
            "name": "fetch_url",
            "description": "Fetches the text content from a given URL. Use this for scraping websites or getting data from online sources.",
            "parameters": {
                "type": "object",
                "properties": {
                    "url": {
                        "type": "string",
                        "description": "The complete URL to fetch content from.",
                    },
                },
                "required": ["url"],
            },
        },
    },
    # Arbitrary Python execution for complex, multi-step analyses.
    {
        "type": "function",
        "function": {
            "name": "python_interpreter",
            "description": (
                "Executes Python code in an isolated environment for data analysis, manipulation, and visualization. "
                "The environment has pandas, matplotlib, numpy, and scikit-learn available. "
                "The code can access user-uploaded files directly by their filename (e.g., pd.read_csv('data.csv')). "
                "To return a plot, save it as 'output.png'. All print() output is captured as the result."
            ),
            "parameters": {
                "type": "object",
                "properties": {
                    "code": {
                        "type": "string",
                        "description": "The Python code to execute.",
                    },
                },
                "required": ["code"],
            },
        },
    },
    # Dataset inspection (columns, dtypes, describe()).
    {
        "type": "function",
        "function": {
            "name": "get_dataframe_info",
            "description": "Reads a data file (like a .csv or .parquet) and returns a JSON summary including column names, data types, non-null counts, and descriptive statistics (mean, std, min, max, etc.). This is the best first step for understanding any dataset.",
            "parameters": {
                "type": "object",
                "properties": {
                    "file_path": {
                        "type": "string",
                        "description": "The filename of the data file to analyze (e.g., 'data.csv').",
                    },
                },
                "required": ["file_path"],
            },
        },
    },
    # Pearson correlation between two columns.
    {
        "type": "function",
        "function": {
            "name": "calculate_correlation",
            "description": "Computes the Pearson correlation coefficient between two specific numerical columns in a given data file. The name of this function is `calculate_correlation`.",
            "parameters": {
                "type": "object",
                "properties": {
                    "file_path": {"type": "string", "description": "The filename of the data file (e.g., 'data.csv')."},
                    "column1": {"type": "string", "description": "The name of the first column."},
                    "column2": {"type": "string", "description": "The name of the second column."},
                },
                "required": ["file_path", "column1", "column2"],
            },
        },
    },
    # Pivot-table summarization.
    {
        "type": "function",
        "function": {
            "name": "create_pivot_table",
            "description": "Generates a pivot table to summarize data. This function takes a file and the names of the columns to use for the index, columns, and values of the pivot table. The name of this function is `create_pivot_table`.",
            "parameters": {
                "type": "object",
                "properties": {
                    "file_path": {"type": "string", "description": "The filename of the data file (e.g., 'data.csv')."},
                    "index": {"type": "string", "description": "The name of the column to use as the pivot table's index (rows)."},
                    "columns": {"type": "string", "description": "The name of the column to use as the pivot table's columns."},
                    "values": {"type": "string", "description": "The name of the column to aggregate as the values in the pivot table."},
                },
                "required": ["file_path", "index", "columns", "values"],
            },
        },
    },
    # SQL over an uploaded SQLite/DuckDB database via SQLAlchemy.
    {
        "type": "function",
        "function": {
            "name": "run_sql_query",
            "description": "Executes a SQL query against a database (like SQLite or DuckDB) and returns the result as JSON. The name of this function is `run_sql_query`.",
            "parameters": {
                "type": "object",
                "properties": {
                    "query": {
                        "type": "string",
                        "description": "The SQL query to execute.",
                    },
                    "db_connection_string": {
                        "type": "string",
                        "description": "The SQLAlchemy connection string for the database. For an uploaded SQLite file named 'my_db.db', use 'sqlite:///my_db.db'. For a DuckDB file, use 'duckdb:///my_db.duckdb'.",
                    },
                },
                "required": ["query", "db_connection_string"],
            },
        },
    },
    # LLM-backed sentiment classification.
    {
        "type": "function",
        "function": {
            "name": "get_sentiment",
            "description": "Analyzes a piece of text (like a movie review) to determine if its sentiment is positive, negative, or neutral. The name of this function is `get_sentiment`.",
            "parameters": {
                "type": "object",
                "properties": {
                    "text_to_analyze": {
                        "type": "string",
                        "description": "The text content to be analyzed.",
                    },
                },
                "required": ["text_to_analyze"],
            },
        },
    },
    # Wikipedia summaries.
    {
        "type": "function",
        "function": {
            "name": "scrape_wikipedia_summary",
            "description": "Fetches the clean text summary from a Wikipedia page. Use this tool specifically for getting information from Wikipedia. The name of this function is `scrape_wikipedia_summary`.",
            "parameters": {
                "type": "object",
                "properties": {
                    "query": {
                        "type": "string",
                        "description": "The title or search query for the Wikipedia page (e.g., 'Python (programming language)').",
                    },
                },
                "required": ["query"],
            },
        },
    },
    # PDF table extraction (tabula).
    {
        "type": "function",
        "function": {
            "name": "scrape_pdf_tables",
            "description": "Extracts all tabular data from a PDF document and returns it as a list of JSON objects. Use this for any PDF that contains tables. The name of this function is `scrape_pdf_tables`.",
            "parameters": {
                "type": "object",
                "properties": {
                    "file_path": {
                        "type": "string",
                        "description": "The filename of the PDF file to process (e.g., 'report.pdf').",
                    },
                },
                "required": ["file_path"],
            },
        },
    },
    # Vision-model image Q&A.
    {
        "type": "function",
        "function": {
            "name": "analyze_image_content",
            "description": "Analyzes an uploaded image file (e.g., a PNG or JPG) and answers a specific question about its contents. Use this to identify objects, read text, or describe scenes in an image. The name of this function is `analyze_image_content`.",
            "parameters": {
                "type": "object",
                "properties": {
                    "image_path": {"type": "string", "description": "The filename of the image to analyze (e.g., 'chart.png')."},
                    "prompt": {"type": "string", "description": "The specific question to ask about the image (e.g., 'What is the title of this chart?', 'Is there a cat in this picture?')."},
                },
                "required": ["image_path", "prompt"],
            },
        },
    },
    # Address -> lat/lon via Nominatim.
    {
        "type": "function",
        "function": {
            "name": "geocode_address",
            "description": "Finds the geographic coordinates (latitude and longitude) for a given street address, city, or landmark. Uses the Nominatim service. The name of this function is `geocode_address`.",
            "parameters": {
                "type": "object",
                "properties": {
                    "address": {
                        "type": "string",
                        "description": "The address or place name to geocode (e.g., '1600 Amphitheatre Parkway, Mountain View, CA' or 'Tokyo Tower').",
                    },
                },
                "required": ["address"],
            },
        },
    },
    # Step 1 of the dynamic-scrape flow: render JS and save the HTML.
    {
        "type": "function",
        "function": {
            "name": "scrape_dynamic_site",
            "description":
            "Renders a JavaScript-heavy website and saves the complete HTML to a file named 'scraped_page.html'. This is the first step in a two-step process. After calling this, use the 'parse_html' tool to extract specific data from the saved file. The name of this function is `scrape_dynamic_site`.", "parameters": {
                "type": "object",
                "properties": {
                    "url": {"type": "string", "description": "The URL of the dynamic website to scrape."},
                },
                "required": ["url"],
            },
        },
    },
    # Step 2 of the dynamic-scrape flow: CSS-selector extraction.
    {
        "type": "function",
        "function": {
            "name": "parse_html",
            "description": "Extracts specific data from an HTML file (like one saved by 'scrape_dynamic_site') using CSS selectors. Provide a dictionary where keys are desired data names and values are the CSS selectors to find that data. The name of this function is `parse_html`.",
            "parameters": {
                "type": "object",
                "properties": {
                    "file_path": {"type": "string", "description": "The local filename of the HTML file to parse (e.g., 'scraped_page.html')."},
                    "selectors": {
                        "type": "object",
                        "description": "A JSON object of 'data_name': 'css_selector' pairs. For example: {\"titles\": \"h2.product-title\", \"prices\": \".price-tag\"}",
                    },
                },
                "required": ["file_path", "selectors"],
            },
        },
    },
    # BBC Weather forecasts by numeric location ID.
    {
        "type": "function",
        "function": {
            "name": "get_bbc_weather",
            "description": "Fetches the weather forecast for a location using its BBC Weather ID. Can provide a 3-day summary or a detailed hour-by-hour forecast. The name of this function is `get_bbc_weather`.",
            "parameters": {
                "type": "object",
                "properties": {
                    "location_id": {
                        "type": "string",
                        "description": "The numerical ID for the location (e.g., '2643743' for London).",
                    },
                    "report_type": {
                        "type": "string",
                        "description": "The type of report to generate. Use 'summary' for a 3-day overview or 'detailed' for an hour-by-hour forecast.",
                        "enum": ["summary", "detailed"],  # 'enum' helps the LLM choose a valid option
                    },
                },
                "required": ["location_id"],
            },
        },
    },
]
285
+
286
+
287
def get_bbc_weather(location_id: str, report_type: str = 'summary') -> str:
    """
    Fetches the weather forecast for a given BBC Weather location ID.
    Can return a 'summary' (default) or a 'detailed' hour-by-hour report.

    Returns a JSON string on success, or a plain error-message string on
    any failure (network error, bad HTTP status, unexpected payload shape),
    so the agent's self-correction loop can react instead of crashing.
    """
    print(f"Executing Tool 'get_bbc_weather' for ID: {location_id}, Type: {report_type}")

    # Aggregated-forecast endpoint of the BBC weather broker CDN.
    url = f"https://weather-broker-cdn.api.bbci.co.uk/en/forecast/aggregated/{location_id}"

    try:
        response = requests.get(url, timeout=15)
        response.raise_for_status()
        weather_data = response.json()

        forecasts_data = weather_data.get("forecasts", [])
        if not forecasts_data:
            return "Error: Forecast data not found in the API response."

        # Only the first forecast entry is used.
        report = forecasts_data[0]
        location_name = report.get("location", {}).get("name")

        # --- NEW LOGIC ---
        if report_type == 'detailed':
            # Extract the detailed, timeseries forecast
            detailed_forecast = {
                "location_name": location_name,
                "issued_at": report.get("issuedAt"),
                "detailed_forecast": []
            }
            # One entry per time slot; missing fields come through as null.
            # NOTE(review): "timestamp" is taken from "localDate" — presumably
            # a date-only value; confirm against the live API payload.
            for slot in report.get("detailed", {}).get("reports", []):
                hour_summary = {
                    "timestamp": slot.get("localDate"),
                    "temperature_c": slot.get("temperatureC"),
                    "feels_like_temp_c": slot.get("feelsLikeTempC"),
                    "wind_speed_mph": slot.get("windSpeedMph"),
                    "wind_direction": slot.get("windDirectionAbbreviation"),
                    "precipitation_probability_percent": slot.get("precipitationProbabilityInPercent"),
                    "weather_type": slot.get("weatherType")
                }
                detailed_forecast["detailed_forecast"].append(hour_summary)
            return json.dumps(detailed_forecast, indent=2)

        else:  # Default to 'summary'
            # The existing summary logic: one entry per day.
            summary_report = {
                "location_name": location_name,
                "issued_at": report.get("issuedAt"),
                "daily_summary": []
            }
            for day in report.get("summary", {}).get("reports", []):
                day_summary = {
                    "date": day.get("localDate"),
                    "condition": day.get("weatherType"),
                    "max_temp_c": day.get("maxTempC"),
                    "min_temp_c": day.get("minTempC"),
                }
                summary_report["daily_summary"].append(day_summary)
            return json.dumps(summary_report, indent=2)

    except Exception as e:
        # Deliberate best-effort: swallow all errors into a string result.
        return f"An error occurred while processing weather data. Error: {e}"
348
+
349
+
350
def parse_html(file_path: str, selectors: Dict[str, str], work_dir: str) -> str:
    """
    Extract text from a local HTML document using CSS selectors.

    Each entry in ``selectors`` maps an output key to a CSS selector; the
    result is an indented JSON object mapping every key to the list of
    stripped text contents of all elements matching its selector.
    """
    print(f"Executing Tool 'parse_html' for file: {file_path}")
    full_path = Path(work_dir) / file_path
    if not full_path.exists():
        return f"Error: HTML file not found at {full_path}"

    try:
        markup = full_path.read_text(encoding="utf-8")
        soup = BeautifulSoup(markup, "lxml")

        # One selector query per requested key; selectors with no matches
        # simply yield an empty list.
        results = {
            key: [node.get_text(strip=True) for node in soup.select(css)]
            for key, css in selectors.items()
        }
        return json.dumps(results, indent=2)

    except Exception as e:
        return f"Failed to parse HTML file {file_path}. Error: {e}"
378
+
379
+
380
async def scrape_dynamic_site(url: str, work_dir: str) -> str:
    """
    Render a JavaScript-driven page in headless Chromium and persist the
    fully rendered HTML as 'scraped_page.html' inside ``work_dir``.

    Returns a small JSON status object on success, or an error string.
    """
    print(f"Executing Tool 'scrape_dynamic_site' for url: {url}")
    save_path = Path(work_dir) / "scraped_page.html"

    try:
        async with async_playwright() as p:
            browser = await p.chromium.launch()
            page = await browser.new_page()
            # Wait for network activity to settle so client-side rendering
            # has finished before the DOM is captured.
            await page.goto(url, wait_until='networkidle', timeout=30000)
            rendered_html = await page.content()
            await browser.close()

        # Persist the fully rendered HTML for downstream parsing tools.
        save_path.write_text(rendered_html, encoding="utf-8")

        # Report only the filename so callers stay work_dir-relative.
        return json.dumps({
            "status": "success",
            "url": url,
            "saved_to": str(save_path.name),
        })

    except Exception as e:
        return f"Failed to scrape dynamic site {url}. Error: {e}"
409
+
410
+
411
def geocode_address(address: str) -> str:
    """
    Resolve a street address or place name to latitude/longitude using
    the Nominatim (OpenStreetMap) geocoder.

    Returns an indented JSON object on success, or an error string when
    the address cannot be resolved.
    """
    print(f"Executing Tool 'geocode_address' for address: {address}")
    try:
        # Nominatim's usage policy asks for a distinctive user agent.
        match = Nominatim(user_agent="data_analyst_agent_v1").geocode(address)

        if match is None:
            return f"Error: Could not find coordinates for the address '{address}'."

        return json.dumps(
            {
                "address": address,
                "latitude": match.latitude,
                "longitude": match.longitude,
                "full_address_found": match.address,
            },
            indent=2,
        )

    except Exception as e:
        return f"Failed to geocode address. Error: {e}"
436
+
437
+
438
+
439
def analyze_image_content(image_path: str, prompt: str, work_dir: str, client: openai.Client) -> str:
    """
    Analyzes the content of an image file using a multimodal LLM and answers a question about it.

    Args:
        image_path: Path to the image, relative to ``work_dir``.
        prompt: The question to ask about the image.
        work_dir: Base directory containing the image file.
        client: An OpenAI-compatible client used for the multimodal call.

    Returns:
        A JSON string with the image path and the model's analysis, or an
        error string on failure.
    """
    print(f"Executing Tool 'analyze_image_content' for file: {image_path}")
    full_path = Path(work_dir) / image_path
    if not full_path.exists():
        return f"Error: Image file not found at {full_path}"

    try:
        # Validate the file is a real image and capture its actual format so
        # the data URI advertises the correct MIME type (previously the URI
        # hardcoded image/jpeg even for PNGs). The context manager ensures
        # the file handle opened by PIL is released.
        with Image.open(full_path) as img:
            image_format = (img.format or "JPEG").lower()
        mime_type = "image/jpeg" if image_format in ("jpg", "jpeg") else f"image/{image_format}"

        # Encode the image to base64 for inline transmission.
        with open(full_path, "rb") as image_file:
            base64_image = base64.b64encode(image_file.read()).decode('utf-8')

        # Call the multimodal model with the text prompt and inline image.
        response = client.chat.completions.create(
            model="openai/gpt-4.1-nano",
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": prompt},
                        {"type": "image_url", "image_url": {"url": f"data:{mime_type};base64,{base64_image}"}},
                    ],
                }
            ],
            max_tokens=500,  # Allow for a reasonably detailed description
        )
        description = response.choices[0].message.content
        return json.dumps({"image": image_path, "analysis": description})

    except Exception as e:
        return f"Failed to analyze image. Error: {e}"
475
+
476
+
477
def scrape_wikipedia_summary(query: str) -> str:
    """
    Look up ``query`` on Wikipedia and return the page's lead summary as
    an indented JSON object.

    Missing-page and disambiguation conditions are reported as error
    strings rather than raised.
    """
    print(f"Executing Tool 'scrape_wikipedia_summary' for query: {query}")
    try:
        # auto_suggest lets the library correct near-miss queries.
        page_summary = wikipedia.summary(query, auto_suggest=True)
        return json.dumps({"query": query, "summary": page_summary}, indent=2)

    except wikipedia.exceptions.PageError:
        return f"Error: Could not find a Wikipedia page for the query '{query}'."
    except wikipedia.exceptions.DisambiguationError as e:
        return f"Error: The query '{query}' is ambiguous. It could refer to any of the following: {e.options}"
    except Exception as e:
        return f"Failed to scrape Wikipedia. Error: {e}"
498
+
499
+
500
def scrape_pdf_tables(file_path: str, work_dir: str) -> str:
    """
    Extract every table found across all pages of a PDF file.

    Each detected table is serialized with pandas' 'split' JSON layout
    and the set is bundled under 'extracted_tables'. Requires a Java
    runtime, since tabula-py shells out to Tabula.
    """
    print(f"Executing Tool 'scrape_pdf_tables' for file: {file_path}")
    full_path = Path(work_dir) / file_path
    if not full_path.exists():
        return f"Error: PDF file not found at {full_path}"

    try:
        # tabula returns one DataFrame per detected table.
        frames = tabula.read_pdf(full_path, pages='all', multiple_tables=True)

        if not frames:
            return "No tables were found in the PDF file."

        serialized = [frame.to_json(orient='split') for frame in frames]
        return json.dumps({"file_name": file_path, "extracted_tables": serialized})

    except Exception as e:
        return f"Failed to scrape tables from PDF. Make sure Java is installed on the system. Error: {e}"
524
+
525
+
526
def get_sentiment(text_to_analyze: str, client: openai.Client) -> str:
    """
    Classify a piece of text as positive, negative, or neutral.

    Delegates to a small LLM constrained to answer with a single word,
    validates the answer, and returns it as a JSON object.
    """
    print(f"Executing Tool 'get_sentiment'")

    try:
        # A tightly constrained system prompt turns the chat model into a
        # three-way classifier.
        completion = client.chat.completions.create(
            model="openai/gpt-5-nano",  # Use a fast and cheap model for this simple task
            messages=[
                {"role": "system", "content": "You are a sentiment analysis tool. Classify the user's text as 'positive', 'negative', or 'neutral'. Respond with only one of these three words and nothing else."},
                {"role": "user", "content": text_to_analyze}
            ],
            max_tokens=5,  # Limit the output to a single word
            temperature=0.0  # Make the output deterministic
        )
        label = completion.choices[0].message.content.lower().strip()

        # Reject anything outside the allowed label set.
        if label not in ["positive", "negative", "neutral"]:
            return "Error: Could not determine a valid sentiment."

        return json.dumps({"text": text_to_analyze, "sentiment": label})

    except Exception as e:
        return f"Failed to get sentiment. Error: {e}"
553
+
554
+
555
def run_sql_query(query: str, db_connection_string: str) -> str:
    """
    Run a SQL query against a database and return the rows as JSON records.

    Accepts any SQLAlchemy-compatible connection string; file-backed
    engines such as SQLite ('sqlite:///path/to/database.db') and DuckDB
    are the expected case. Database file paths should be relative to the
    agent's working directory.
    """
    print(f"Executing Tool 'run_sql_query'")

    try:
        engine = create_engine(db_connection_string)

        # Let pandas drive execution so results come back as a DataFrame.
        with engine.connect() as conn:
            rows = pd.read_sql_query(sql=text(query), con=conn)

        return rows.to_json(orient="records")

    except Exception as e:
        return f"Failed to execute SQL query. Error: {e}"
577
+
578
+
579
def calculate_correlation(file_path: str, column1: str, column2: str, work_dir: str) -> str:
    """
    Compute the Pearson correlation between two columns of a CSV or
    Parquet file and return it as an indented JSON object.
    """
    print(f"Executing Tool 'calculate_correlation' for file: {file_path}")
    full_path = Path(work_dir) / file_path
    if not full_path.exists():
        return f"Error: Data file not found at {full_path}"

    try:
        lowered = file_path.lower()
        if lowered.endswith('.csv'):
            frame = pd.read_csv(full_path)
        elif lowered.endswith('.parquet'):
            frame = pd.read_parquet(full_path)
        else:
            return "Error: Unsupported file type."

        # Both columns must be present before computing anything.
        if any(col not in frame.columns for col in (column1, column2)):
            return f"Error: One or both columns ('{column1}', '{column2}') not found in the file."

        return json.dumps(
            {
                "file_name": file_path,
                "column_1": column1,
                "column_2": column2,
                "pearson_correlation": frame[column1].corr(frame[column2]),
            },
            indent=2,
        )

    except Exception as e:
        return f"Failed to calculate correlation. Error: {e}"
614
+
615
def create_pivot_table(file_path: str, index: str, columns: str, values: str, work_dir: str) -> str:
    """
    Creates a pivot table from the data in the specified file.

    Args:
        file_path: CSV or Parquet file, relative to ``work_dir``.
        index: Column whose values become the pivot table rows.
        columns: Column whose values become the pivot table columns.
        values: Column aggregated (summed) into each cell.
        work_dir: Base directory containing the data file.

    Returns:
        The pivot table in pandas 'split' JSON format, or an error string.
    """
    print(f"Executing Tool 'create_pivot_table' for file: {file_path}")
    full_path = Path(work_dir) / file_path
    if not full_path.exists():
        return f"Error: Data file not found at {full_path}"

    try:
        if file_path.lower().endswith('.csv'):
            df = pd.read_csv(full_path)
        elif file_path.lower().endswith('.parquet'):
            df = pd.read_parquet(full_path)
        else:
            return f"Error: Unsupported file type."

        # Use the string alias "sum" for the aggregator: passing np.sum as a
        # callable is deprecated in pandas 2.x (emits a FutureWarning) and
        # needlessly couples this tool to numpy.
        pivot_table = pd.pivot_table(df, values=values, index=index, columns=columns, aggfunc="sum")

        # Return the pivot table as a JSON string
        return pivot_table.to_json(orient="split")

    except Exception as e:
        return f"Failed to create pivot table. Error: {e}"
640
+
641
def get_dataframe_info(file_path: str, work_dir: str) -> str:
    """
    Summarize a CSV or Parquet file for the agent.

    Bundles the output of ``DataFrame.info`` (column names, dtypes,
    non-null counts) together with ``describe(include='all')`` statistics
    into one indented JSON document.
    """
    print(f"Executing Tool 'get_dataframe_info' for file: {file_path}")
    full_path = Path(work_dir) / file_path
    if not full_path.exists():
        return f"Error: Data file not found at {full_path}"

    try:
        lowered = file_path.lower()
        if lowered.endswith('.csv'):
            frame = pd.read_csv(full_path)
        elif lowered.endswith('.parquet'):
            frame = pd.read_parquet(full_path)
        else:
            return f"Error: Unsupported file type. Only .csv and .parquet are supported."

        # df.info() prints rather than returns, so capture it via a buffer.
        buffer = io.StringIO()
        frame.info(buf=buffer)

        return json.dumps(
            {
                "file_name": file_path,
                "info": buffer.getvalue(),
                "statistical_summary": frame.describe(include='all').to_json(orient="split"),
            },
            indent=2,
        )

    except Exception as e:
        return f"Failed to get DataFrame info. Error: {e}"
678
+
679
def fetch_url(url: str) -> str:
    """Fetches text content from a specified URL using the AI Pipe proxy."""
    print(f"Executing Tool 'fetch_url' with URL: {url}")
    try:
        # All traffic is routed through the AI Pipe proxy endpoint.
        resp = requests.get(f"https://aipipe.org/proxy/{url}", timeout=30)
        resp.raise_for_status()
        return resp.text
    except requests.RequestException as e:
        return f"Error: Failed to fetch URL {url}. Reason: {e}"
689
+
690
def python_interpreter(code: str, work_dir: str) -> str:
    """
    Executes Python code in a sandboxed subprocess within a specific working directory.

    The code can access any files within its `work_dir`.
    If the code generates 'output.png', it will be base64 encoded and returned.

    Args:
        code: Python source to run.
        work_dir: Directory the script is written to and executed from.

    Returns:
        The script's stdout on success (prefixed with a base64 data URI when
        'output.png' was produced), or an error description string.
    """
    print(f"Executing Tool 'python_interpreter' in directory: {work_dir}")
    work_path = Path(work_dir)
    script_path = work_path / "agent_script.py"
    plot_path = work_path / "output.png"

    # Persist the code so the subprocess can run it from inside work_dir.
    # Explicit UTF-8 avoids platform-default-encoding failures.
    with open(script_path, "w", encoding="utf-8") as f:
        f.write(code)
    print("\n\n--- 📜 DECODING: SCRIPT TO EXECUTE 📜 ---")
    print(code)
    print("------------------------------------------\n")

    try:
        # Run with the same interpreter executing this process so the script
        # sees the same installed packages. (The previous version assigned
        # sys.executable twice; with check=False, subprocess.run never raises
        # CalledProcessError, so that dead except branch is removed.)
        process = subprocess.run(
            [sys.executable, str(script_path)],
            cwd=work_path,  # Run the script from within the temp directory
            capture_output=True,
            text=True,
            timeout=1000,
            check=False,
        )
        print("\n\n--- 📤 DECODING: SCRIPT RAW OUTPUT 📤 ---")
        print(f"Return Code: {process.returncode}")
        print("--- STDOUT ---")
        print(process.stdout)
        print("--- STDERR ---")
        print(process.stderr)
        print("------------------------------------------\n")

        if process.returncode != 0:
            return f"SCRIPT FAILED with return code {process.returncode}:\nSTDOUT:\n{process.stdout}\nSTDERR:\n{process.stderr}"

        # Check if a plot was generated as output.png
        if plot_path.exists():
            with open(plot_path, "rb") as img_file:
                img_base64 = base64.b64encode(img_file.read()).decode('utf-8')
            # Prepend the plot's data URI to the stdout
            plot_uri = f"data:image/png;base64,{img_base64}"
            return f"image_output:\n{plot_uri}\n\ntext_output:\n{process.stdout}"

        # If successful, just return the standard output
        return process.stdout

    except subprocess.TimeoutExpired:
        return "Error: The Python script took too long to execute."
    except Exception as e:
        return f"An unexpected error occurred: {e}"
756
+
757
+
758
+