KarthikMuraliM commited on
Commit
c0897c1
·
1 Parent(s): 553f95b

Initial commit from existing project

Browse files
Files changed (7) hide show
  1. .gitignore +130 -0
  2. DockerFile +46 -0
  3. LICENSE +21 -0
  4. README copy.md +1 -0
  5. main.py +229 -0
  6. requirements.txt +30 -0
  7. tools.py +758 -0
.gitignore ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+
28
+ # PyInstaller
29
+ # Usually these files are written by a python script from a template
30
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
31
+ *.manifest
32
+ *.spec
33
+
34
+ # Installer logs
35
+ pip-log.txt
36
+ pip-delete-this-directory.txt
37
+
38
+ # Unit test / coverage reports
39
+ htmlcov/
40
+ .tox/
41
+ .nox/
42
+ .coverage
43
+ .coverage.*
44
+ .cache
45
+ nosetests.xml
46
+ coverage.xml
47
+ *.cover
48
+ *.py,cover
49
+ *.hypothesis/
50
+ .pytest_cache/
51
+
52
+ # Translations
53
+ *.mo
54
+ *.pot
55
+
56
+ # Django stuff:
57
+ *.log
58
+ local_settings.py
59
+ db.sqlite3
60
+ db.sqlite3-journal
61
+
62
+ # Flask stuff:
63
+ instance/
64
+ .webassets-cache
65
+
66
+ # Scrapy stuff:
67
+ .scrapy
68
+
69
+ # Sphinx documentation
70
+ docs/_build/
71
+
72
+ # PyBuilder
73
+ .pybuilder/
74
+ target/
75
+
76
+ # Jupyter Notebook
77
+ .ipynb_checkpoints
78
+
79
+ # IPython
80
+ profile_default/
81
+ ipython_config.py
82
+
83
+ # pyenv
84
+ .python-version
85
+
86
+ # pipenv
87
+ Pipfile.lock
88
+
89
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow
90
+ __pypackages__/
91
+
92
+ # Celery stuff
93
+ celerybeat-schedule
94
+ celerybeat.pid
95
+
96
+ # SageMath parsed files
97
+ *.sage.py
98
+
99
+ # Environments
100
+ .env
101
+ .venv
102
+ env/
103
+ venv/
104
+ ENV/
105
+ env.bak/
106
+ venv.bak/
107
+
108
+ # Spyder project settings
109
+ .spyderproject
110
+ .spyproject
111
+
112
+ # Rope project settings
113
+ .ropeproject
114
+
115
+ # mkdocs documentation
116
+ /site
117
+
118
+ # mypy
119
+ .mypy_cache/
120
+ .dmypy.json
121
+ dmypy.json
122
+
123
+ # Pyre type checker
124
+ .pyre/
125
+
126
+ # pytype static type analyzer
127
+ .pytype/
128
+
129
+ # Cython debug symbols
130
+ cython_debug/
DockerFile ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Dockerfile

# Start from a standard Python 3.11 base image
FROM python:3.11-slim

# Set the working directory inside the container
WORKDIR /app

# 1. Install System-Level Dependencies
# git (good practice), Java for the tabula-py library, plus curl and unzip.
RUN apt-get update && apt-get install -y --no-install-recommends \
    curl \
    unzip \
    git \
    default-jre \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

# 2. Install 'uv', the fast Python package manager
RUN curl -LsSf https://astral.sh/uv/install.sh | sh
# BUGFIX: current uv installers place the binary in ~/.local/bin (older
# releases used ~/.cargo/bin) — put both on PATH so either layout works.
ENV PATH="/root/.local/bin:/root/.cargo/bin:${PATH}"

# 3. Install Python Dependencies using 'uv'
# Copy requirements file first to leverage Docker layer caching
COPY requirements.txt .
# BUGFIX: 'uv pip install' refuses to run without an active virtualenv
# unless --system is passed; install into the image's system interpreter.
RUN uv pip install --system --no-cache-dir -r requirements.txt

# 4. Install Playwright Browsers
# BUGFIX: --with-deps also installs the system libraries the browsers need,
# which are missing from the slim base image.
RUN playwright install --with-deps

# 5. Copy the application code (main.py, tools.py, ...) into /app
COPY . .

# 6. The API listens on port 8000
EXPOSE 8000

# 7. Startup command. --host 0.0.0.0 is ESSENTIAL inside a container.
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]
LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2025 KarthikMurali-M
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
README copy.md ADDED
@@ -0,0 +1 @@
 
 
1
+ # Data-Analyst-Agent-API
main.py ADDED
@@ -0,0 +1,229 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# main.py — application setup: imports, environment, LLM client, tool registry.
import asyncio
import json
import os
import re
import tempfile
import time
from pathlib import Path
from typing import Dict, List

import openai
from dotenv import load_dotenv
from fastapi import FastAPI, File, HTTPException, UploadFile, status

from tools import (
    TOOL_DEFINITIONS,
    analyze_image_content,
    calculate_correlation,
    create_pivot_table,
    fetch_url,
    geocode_address,
    get_bbc_weather,
    get_dataframe_info,
    get_sentiment,
    parse_html,
    python_interpreter,
    run_sql_query,
    scrape_dynamic_site,
    scrape_pdf_tables,
    scrape_wikipedia_summary,
)

# Directory where files will be saved
UPLOAD_DIR = Path("uploaded_files")
UPLOAD_DIR.mkdir(parents=True, exist_ok=True)

# --- Load Environment Variables ---
# Loaded from a local .env file so AIPIPE_TOKEN can be supplied via file.
load_dotenv()

if "AIPIPE_TOKEN" not in os.environ:
    raise RuntimeError("The AIPIPE_TOKEN environment variable is not set. Please set it to your token from aipipe.org.")

# Configure the OpenAI client to point to the AI Pipe proxy.
# The AI Pipe token plays the role of the OpenAI API key.
client = openai.OpenAI(
    base_url="https://aipipe.org/openrouter/v1",
    api_key=os.getenv("AIPIPE_TOKEN"),
)

# Registry mapping the tool names the planner may emit to their implementations.
AVAILABLE_TOOLS: Dict[str, callable] = {
    "fetch_url": fetch_url,
    "python_interpreter": python_interpreter,
    "get_dataframe_info": get_dataframe_info,
    "calculate_correlation": calculate_correlation,
    "create_pivot_table": create_pivot_table,
    "run_sql_query": run_sql_query,
    "get_sentiment": get_sentiment,
    "scrape_wikipedia_summary": scrape_wikipedia_summary,
    "scrape_pdf_tables": scrape_pdf_tables,
    "analyze_image_content": analyze_image_content,
    "geocode_address": geocode_address,
    "scrape_dynamic_site": scrape_dynamic_site,
    "parse_html": parse_html,
    "get_bbc_weather": get_bbc_weather,
}
47
+
48
+
49
+ def is_output_valid(result: str | None) -> bool:
50
+ """A simple validator to check if the agent's output is complete."""
51
+ if result is None or result.strip() == "":
52
+ return False
53
+ try:
54
+ data = json.loads(result)
55
+ # Check for common failure indicators like null values or "N/A"
56
+ if isinstance(data, list) and any(x is None or "N/A" in str(x) for x in data):
57
+ return False
58
+ if isinstance(data, dict) and any(v is None for v in data.values()):
59
+ return False
60
+ except (json.JSONDecodeError, TypeError):
61
+ return False # Not valid JSON
62
+ return True
63
+
64
+
65
# FastAPI application instance; metadata appears in the auto-generated docs.
app = FastAPI(
    title="Data Analyst Agent API",
    description="An API that uses LLMs to source, prepare, analyze, and visualize data. ;By 24f2001293@ds.study.iitm.ac.in",
)
69
+
70
+
71
@app.post("/api/")
async def process_analysis_request(files: List[UploadFile] = File(...)):
    """Run the plan -> execute -> validate agent loop over the uploaded files.

    Expects one .txt file with the questions (preferably named '*question*')
    plus any number of data attachments. Retries up to 3 times, feeding the
    previous plan and error back to the planner; on total failure returns an
    empty {} or [] depending on the requested output format.
    """
    max_retries = 3
    last_error = ""
    last_plan = {}

    with tempfile.TemporaryDirectory() as work_dir:
        # --- 1. FILE HANDLING (Using the Robust In-Memory Method) ---
        work_path = Path(work_dir)
        if not files:
            raise HTTPException(status.HTTP_400_BAD_REQUEST, "No files uploaded.")

        file_contents = {f.filename: await f.read() for f in files}

        # Identify the questions file: prefer a .txt whose name contains
        # "question"; otherwise fall back to the first .txt seen.
        questions_file_name, attached_file_names = None, []
        first_txt_file_name = None
        q_pattern = re.compile(r'question', re.IGNORECASE)

        for filename in file_contents.keys():
            is_txt = filename.lower().endswith(".txt")
            if is_txt and first_txt_file_name is None:
                first_txt_file_name = filename
            if is_txt and q_pattern.search(filename):
                if questions_file_name is None:
                    questions_file_name = filename
                else:
                    attached_file_names.append(filename)
            else:
                attached_file_names.append(filename)

        if questions_file_name is None:
            questions_file_name = first_txt_file_name
            if questions_file_name in attached_file_names:
                attached_file_names.remove(questions_file_name)

        if not questions_file_name:
            raise HTTPException(status.HTTP_400_BAD_REQUEST, "No .txt question file found.")

        task_content = file_contents[questions_file_name].decode("utf-8")

        # Materialize every upload into the temp working dir so tools can
        # reference files by plain filename.
        for filename, content in file_contents.items():
            file_path = work_path / filename
            with open(file_path, "wb") as f:
                f.write(content)
        # ----------------------------------------------------
        # BUGFIX: time.sleep() would block the event loop inside this async
        # handler; yield to the loop instead.
        await asyncio.sleep(0.1)

        # --- SELF-CORRECTION LOOP ---
        for i in range(max_retries):
            print(f"\n--- AGENT ATTEMPT #{i + 1} ---")

            # --- 2. PLANNING ---
            # BUGFIX: removed the redundant inline re-definition of
            # AVAILABLE_TOOLS; the module-level registry (same entries)
            # is used directly.
            planner_system_prompt = f"""
You are an expert-level data analysis planner. Your purpose is to convert a user's request into a step-by-step JSON execution plan.

You have been provided with the following available tools:
{", ".join(AVAILABLE_TOOLS.keys())}

You must decide on the best strategy to fulfill the request. You have two strategies available:

**Strategy 1: Use a Specialized Tool.**
If the user's request can be answered directly and completely by a single call to one of the specialized tools (e.g., `get_bbc_weather`, `geocode_address`, `get_sentiment`), you MUST generate a simple, one-step plan that calls that tool. This is your preferred strategy for simple, direct requests. USE THE SAME NAMES GIVEN TO YOU TO CREATE THE TOOL CALLS.

**Strategy 2: Generate a Single Python Script.**
If the user's request is complex, requires multiple steps, data manipulation, or cannot be handled by a single specialized tool, you MUST generate a plan containing a SINGLE step that uses the `python_interpreter`. This single step must contain a complete, self-contained Python script that performs all the necessary actions and prints the final JSON output.

CRITICAL RULES:
1. **CHOOSE A STRATEGY:** First, analyze the request. Is it a simple task for a specialized tool, or a complex one requiring a full script?
2. **TOOL NAMES:** You MUST use the exact tool names from the provided list.
3. **DATA CLEANING (IMPORTANT):** When you load a CSV file using pandas, the column names might have leading/trailing whitespace. Your first step after loading the data MUST be to clean the column names. A good method is: `df.columns = df.columns.str.strip()`.
4. **FINAL OUTPUT (CRITICAL):** You MUST read the user's request very carefully to determine the exact final output format.
- If the user asks for a **JSON object with specific keys**, your script's final print statement MUST produce a JSON object with EXACTLY those keys and data types.
- If the user asks for a **JSON array**, your script's final print statement MUST produce a JSON array with the raw values in the correct order.
5. **NO PLACEHOLDERS:** You MUST perform the actual calculations and data analysis required. Do not use placeholder or example values in your final output. The results must be derived from the provided data sources.
6. Your entire output MUST be ONLY a valid JSON object representing the execution plan. The plan should follow this schema: {{"plan": {{"steps": [{{...}}]}}}}
"""

            # On retries (i > 0), add the context of the last failure
            if i > 0:
                user_prompt = f"The previous attempt failed.\nPREVIOUS PLAN:\n{json.dumps(last_plan, indent=2)}\n\nPREVIOUS ERROR/OUTPUT:\n{last_error}\n\nPlease analyze the error and generate a new, corrected plan to fulfill the original request:\n{task_content}"
            else:
                user_prompt = f"--- USER REQUEST ---\n{task_content}\n\n--- AVAILABLE FILES ---\n{attached_file_names}"

            print("--- Calling Planner LLM to create execution plan ---")
            planner_messages = [
                {"role": "system", "content": planner_system_prompt},
                {"role": "user", "content": user_prompt},
            ]

            try:
                response = client.chat.completions.create(
                    model="openai/gpt-5-nano",
                    messages=planner_messages,
                    response_format={"type": "json_object"},
                )
                plan_str = response.choices[0].message.content
                plan = json.loads(plan_str)
                print("\n\n--- 🕵️ DECODING: PLAN RECEIVED 🕵️ ---")
                print(json.dumps(plan, indent=2))
                print("-----------------------------------------\n")

                last_plan = plan
                print("--- Plan received from Planner ---")
            except Exception as e:
                last_error = f"Planner failed to generate a valid JSON plan: {e}"
                if i < max_retries - 1:
                    continue  # Go to the next retry attempt
                else:
                    break  # Exit loop if retries are exhausted

            # --- 3. EXECUTION ---
            print("--- Starting Worker execution ---")
            final_result = None
            try:
                plan_steps = plan.get("plan", {}).get("steps", [])
                if not plan_steps:
                    raise ValueError("The generated plan contains no steps.")

                for step_data in plan_steps:
                    # Accept the several key spellings the planner may use
                    # for the tool name.
                    tool_name = step_data.get("tool_name", step_data.get("tool", step_data.get("action", step_data.get("name"))))
                    if not tool_name:
                        raise ValueError("Plan step is missing a 'tool' or 'action' key.")

                    tool_function = AVAILABLE_TOOLS.get(tool_name)
                    if not tool_function:
                        raise ValueError(f"Plan requested an unknown tool: '{tool_name}'")

                    # Everything that is not plan metadata is a tool argument.
                    known_keys = ["step", "id", "tool", "tool_name", "action", "description", "notes", "output"]
                    arguments = {k: v for k, v in step_data.items() if k not in known_keys}
                    if "script" in arguments and "code" not in arguments:
                        arguments["code"] = arguments.pop("script")

                    # Add special context arguments the planner cannot know.
                    if tool_name in ["python_interpreter", "get_dataframe_info", "calculate_correlation", "create_pivot_table", "scrape_pdf_tables", "analyze_image_content", "scrape_dynamic_site", "parse_html"]:
                        arguments["work_dir"] = work_dir
                    if tool_name in ["get_sentiment", "analyze_image_content"]:
                        arguments["client"] = client

                    # Execute the tool (await coroutine tools).
                    if asyncio.iscoroutinefunction(tool_function):
                        output = await tool_function(**arguments)
                    else:
                        output = tool_function(**arguments)
                    final_result = output

                # --- 4. VALIDATION ---
                if is_output_valid(final_result):
                    print("--- Output is valid. Task complete. ---")
                    print("\n\n--- ✅ DECODING: FINAL VALID OUTPUT ✅ ---")
                    print(final_result)
                    print("------------------------------------------\n")

                    return json.loads(final_result)
                else:
                    print("--- Output is invalid. Triggering self-correction. ---")
                    last_error = f"The script executed but produced an invalid result: {final_result}"

            except Exception as e:
                print("--- Execution failed. Triggering self-correction. ---")
                last_error = f"The worker failed to execute the plan. Error: {repr(e)}"

        # If all retries fail, report and fall through to the empty default.
        print(f"--- AGENT FAILED: All {max_retries} attempts exhausted. Returning empty JSON. ---")
        print(f"Last known error was: {last_error}")

        # We need to figure out the expected format (list or dict) from the
        # question. A simple heuristic: "JSON object" -> {}, "JSON array" -> [].
        if "JSON object" in task_content:
            return {}
        elif "JSON array" in task_content:
            return []
        else:
            # A safe default if the format is not specified
            return {}
229
+
requirements.txt ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
fastapi
openai
requests
uvicorn
python-dotenv
python-multipart
pandas
matplotlib
numpy
scikit-learn
pyarrow
tabula-py
Pillow
beautifulsoup4
lxml
httpx
geopy
networkx
duckdb-engine
sqlalchemy
mysql-connector-python
wikipedia
playwright
boto3
tools.py ADDED
@@ -0,0 +1,758 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # tools.py
2
+ import subprocess
3
+ import requests
4
+ import base64
5
+ from pathlib import Path
6
+ import sys
7
+ import json
8
+ import pandas as pd
9
+ from typing import List,Dict
10
+
11
+ import io
12
+ from sqlalchemy import create_engine, text
13
+ import openai
14
+ import wikipedia
15
+ import numpy as np
16
+ import tabula
17
+ from PIL import Image
18
+ import base64
19
+ from geopy.geocoders import Nominatim
20
+ from playwright.async_api import async_playwright
21
+ import asyncio
22
+ from bs4 import BeautifulSoup
23
+ import requests
24
+
25
+
26
+
27
+
28
# OpenAI function-calling schemas for every tool the planner/worker may use.
# Each entry follows the Chat Completions "tools" format: a function name,
# a natural-language description for the LLM, and a JSON-Schema parameter spec.
TOOL_DEFINITIONS = [
    # Generic HTTP fetch.
    {
        "type": "function",
        "function": {
            "name": "fetch_url",
            "description": "Fetches the text content from a given URL. Use this for scraping websites or getting data from online sources.",
            "parameters": {
                "type": "object",
                "properties": {
                    "url": {
                        "type": "string",
                        "description": "The complete URL to fetch content from.",
                    },
                },
                "required": ["url"],
            },
        },
    },
    # Arbitrary Python execution for complex, multi-step analyses.
    {
        "type": "function",
        "function": {
            "name": "python_interpreter",
            "description": (
                "Executes Python code in an isolated environment for data analysis, manipulation, and visualization. "
                "The environment has pandas, matplotlib, numpy, and scikit-learn available. "
                "The code can access user-uploaded files directly by their filename (e.g., pd.read_csv('data.csv')). "
                "To return a plot, save it as 'output.png'. All print() output is captured as the result."
            ),
            "parameters": {
                "type": "object",
                "properties": {
                    "code": {
                        "type": "string",
                        "description": "The Python code to execute.",
                    },
                },
                "required": ["code"],
            },
        },
    },
    # Dataset inspection (columns, dtypes, describe()).
    {
        "type": "function",
        "function": {
            "name": "get_dataframe_info",
            "description": "Reads a data file (like a .csv or .parquet) and returns a JSON summary including column names, data types, non-null counts, and descriptive statistics (mean, std, min, max, etc.). This is the best first step for understanding any dataset.",
            "parameters": {
                "type": "object",
                "properties": {
                    "file_path": {
                        "type": "string",
                        "description": "The filename of the data file to analyze (e.g., 'data.csv').",
                    },
                },
                "required": ["file_path"],
            },
        },
    },
    # Pearson correlation between two columns.
    {
        "type": "function",
        "function": {
            "name": "calculate_correlation",
            "description": "Computes the Pearson correlation coefficient between two specific numerical columns in a given data file. The name of this function is `calculate_correlation`.",
            "parameters": {
                "type": "object",
                "properties": {
                    "file_path": {"type": "string", "description": "The filename of the data file (e.g., 'data.csv')."},
                    "column1": {"type": "string", "description": "The name of the first column."},
                    "column2": {"type": "string", "description": "The name of the second column."},
                },
                "required": ["file_path", "column1", "column2"],
            },
        },
    },
    # Pivot-table summarization.
    {
        "type": "function",
        "function": {
            "name": "create_pivot_table",
            "description": "Generates a pivot table to summarize data. This function takes a file and the names of the columns to use for the index, columns, and values of the pivot table. The name of this function is `create_pivot_table`.",
            "parameters": {
                "type": "object",
                "properties": {
                    "file_path": {"type": "string", "description": "The filename of the data file (e.g., 'data.csv')."},
                    "index": {"type": "string", "description": "The name of the column to use as the pivot table's index (rows)."},
                    "columns": {"type": "string", "description": "The name of the column to use as the pivot table's columns."},
                    "values": {"type": "string", "description": "The name of the column to aggregate as the values in the pivot table."},
                },
                "required": ["file_path", "index", "columns", "values"],
            },
        },
    },
    # SQL over an uploaded SQLite/DuckDB database via SQLAlchemy.
    {
        "type": "function",
        "function": {
            "name": "run_sql_query",
            "description": "Executes a SQL query against a database (like SQLite or DuckDB) and returns the result as JSON. The name of this function is `run_sql_query`.",
            "parameters": {
                "type": "object",
                "properties": {
                    "query": {
                        "type": "string",
                        "description": "The SQL query to execute.",
                    },
                    "db_connection_string": {
                        "type": "string",
                        "description": "The SQLAlchemy connection string for the database. For an uploaded SQLite file named 'my_db.db', use 'sqlite:///my_db.db'. For a DuckDB file, use 'duckdb:///my_db.duckdb'.",
                    },
                },
                "required": ["query", "db_connection_string"],
            },
        },
    },
    # LLM-backed sentiment classification.
    {
        "type": "function",
        "function": {
            "name": "get_sentiment",
            "description": "Analyzes a piece of text (like a movie review) to determine if its sentiment is positive, negative, or neutral. The name of this function is `get_sentiment`.",
            "parameters": {
                "type": "object",
                "properties": {
                    "text_to_analyze": {
                        "type": "string",
                        "description": "The text content to be analyzed.",
                    },
                },
                "required": ["text_to_analyze"],
            },
        },
    },
    # Wikipedia summaries.
    {
        "type": "function",
        "function": {
            "name": "scrape_wikipedia_summary",
            "description": "Fetches the clean text summary from a Wikipedia page. Use this tool specifically for getting information from Wikipedia. The name of this function is `scrape_wikipedia_summary`.",
            "parameters": {
                "type": "object",
                "properties": {
                    "query": {
                        "type": "string",
                        "description": "The title or search query for the Wikipedia page (e.g., 'Python (programming language)').",
                    },
                },
                "required": ["query"],
            },
        },
    },
    # PDF table extraction (tabula).
    {
        "type": "function",
        "function": {
            "name": "scrape_pdf_tables",
            "description": "Extracts all tabular data from a PDF document and returns it as a list of JSON objects. Use this for any PDF that contains tables. The name of this function is `scrape_pdf_tables`.",
            "parameters": {
                "type": "object",
                "properties": {
                    "file_path": {
                        "type": "string",
                        "description": "The filename of the PDF file to process (e.g., 'report.pdf').",
                    },
                },
                "required": ["file_path"],
            },
        },
    },
    # Vision-model image Q&A.
    {
        "type": "function",
        "function": {
            "name": "analyze_image_content",
            "description": "Analyzes an uploaded image file (e.g., a PNG or JPG) and answers a specific question about its contents. Use this to identify objects, read text, or describe scenes in an image. The name of this function is `analyze_image_content`.",
            "parameters": {
                "type": "object",
                "properties": {
                    "image_path": {"type": "string", "description": "The filename of the image to analyze (e.g., 'chart.png')."},
                    "prompt": {"type": "string", "description": "The specific question to ask about the image (e.g., 'What is the title of this chart?', 'Is there a cat in this picture?')."},
                },
                "required": ["image_path", "prompt"],
            },
        },
    },
    # Address -> lat/lon via Nominatim.
    {
        "type": "function",
        "function": {
            "name": "geocode_address",
            "description": "Finds the geographic coordinates (latitude and longitude) for a given street address, city, or landmark. Uses the Nominatim service. The name of this function is `geocode_address`.",
            "parameters": {
                "type": "object",
                "properties": {
                    "address": {
                        "type": "string",
                        "description": "The address or place name to geocode (e.g., '1600 Amphitheatre Parkway, Mountain View, CA' or 'Tokyo Tower').",
                    },
                },
                "required": ["address"],
            },
        },
    },
    # Step 1 of the dynamic-scrape flow: render JS and save the HTML.
    {
        "type": "function",
        "function": {
            "name": "scrape_dynamic_site",
            "description":
            "Renders a JavaScript-heavy website and saves the complete HTML to a file named 'scraped_page.html'. This is the first step in a two-step process. After calling this, use the 'parse_html' tool to extract specific data from the saved file. The name of this function is `scrape_dynamic_site`.", "parameters": {
                "type": "object",
                "properties": {
                    "url": {"type": "string", "description": "The URL of the dynamic website to scrape."},
                },
                "required": ["url"],
            },
        },
    },
    # Step 2 of the dynamic-scrape flow: CSS-selector extraction.
    {
        "type": "function",
        "function": {
            "name": "parse_html",
            "description": "Extracts specific data from an HTML file (like one saved by 'scrape_dynamic_site') using CSS selectors. Provide a dictionary where keys are desired data names and values are the CSS selectors to find that data. The name of this function is `parse_html`.",
            "parameters": {
                "type": "object",
                "properties": {
                    "file_path": {"type": "string", "description": "The local filename of the HTML file to parse (e.g., 'scraped_page.html')."},
                    "selectors": {
                        "type": "object",
                        "description": "A JSON object of 'data_name': 'css_selector' pairs. For example: {\"titles\": \"h2.product-title\", \"prices\": \".price-tag\"}",
                    },
                },
                "required": ["file_path", "selectors"],
            },
        },
    },
    # BBC Weather forecasts by numeric location ID.
    {
        "type": "function",
        "function": {
            "name": "get_bbc_weather",
            "description": "Fetches the weather forecast for a location using its BBC Weather ID. Can provide a 3-day summary or a detailed hour-by-hour forecast. The name of this function is `get_bbc_weather`.",
            "parameters": {
                "type": "object",
                "properties": {
                    "location_id": {
                        "type": "string",
                        "description": "The numerical ID for the location (e.g., '2643743' for London).",
                    },
                    "report_type": {
                        "type": "string",
                        "description": "The type of report to generate. Use 'summary' for a 3-day overview or 'detailed' for an hour-by-hour forecast.",
                        "enum": ["summary", "detailed"],  # 'enum' helps the LLM choose a valid option
                    },
                },
                "required": ["location_id"],
            },
        },
    },
]
285
+
286
+
287
def get_bbc_weather(location_id: str, report_type: str = 'summary') -> str:
    """
    Fetches the weather forecast for a given BBC Weather location ID.
    Can return a 'summary' (default) or a 'detailed' hour-by-hour report.

    Returns a JSON string on success, or a plain error-message string on
    any failure (network error, bad HTTP status, unexpected payload shape),
    so the agent's self-correction loop can react instead of crashing.
    """
    print(f"Executing Tool 'get_bbc_weather' for ID: {location_id}, Type: {report_type}")

    # Aggregated-forecast endpoint of the BBC weather broker CDN.
    url = f"https://weather-broker-cdn.api.bbci.co.uk/en/forecast/aggregated/{location_id}"

    try:
        response = requests.get(url, timeout=15)
        response.raise_for_status()
        weather_data = response.json()

        forecasts_data = weather_data.get("forecasts", [])
        if not forecasts_data:
            return "Error: Forecast data not found in the API response."

        # Only the first forecast entry is used.
        report = forecasts_data[0]
        location_name = report.get("location", {}).get("name")

        # --- NEW LOGIC ---
        if report_type == 'detailed':
            # Extract the detailed, timeseries forecast
            detailed_forecast = {
                "location_name": location_name,
                "issued_at": report.get("issuedAt"),
                "detailed_forecast": []
            }
            # One entry per time slot; missing fields come through as null.
            # NOTE(review): "timestamp" is taken from "localDate" — presumably
            # a date-only value; confirm against the live API payload.
            for slot in report.get("detailed", {}).get("reports", []):
                hour_summary = {
                    "timestamp": slot.get("localDate"),
                    "temperature_c": slot.get("temperatureC"),
                    "feels_like_temp_c": slot.get("feelsLikeTempC"),
                    "wind_speed_mph": slot.get("windSpeedMph"),
                    "wind_direction": slot.get("windDirectionAbbreviation"),
                    "precipitation_probability_percent": slot.get("precipitationProbabilityInPercent"),
                    "weather_type": slot.get("weatherType")
                }
                detailed_forecast["detailed_forecast"].append(hour_summary)
            return json.dumps(detailed_forecast, indent=2)

        else:  # Default to 'summary'
            # The existing summary logic: one entry per day.
            summary_report = {
                "location_name": location_name,
                "issued_at": report.get("issuedAt"),
                "daily_summary": []
            }
            for day in report.get("summary", {}).get("reports", []):
                day_summary = {
                    "date": day.get("localDate"),
                    "condition": day.get("weatherType"),
                    "max_temp_c": day.get("maxTempC"),
                    "min_temp_c": day.get("minTempC"),
                }
                summary_report["daily_summary"].append(day_summary)
            return json.dumps(summary_report, indent=2)

    except Exception as e:
        # Deliberate best-effort: swallow all errors into a string result.
        return f"An error occurred while processing weather data. Error: {e}"
348
+
349
+
350
def parse_html(file_path: str, selectors: Dict[str, str], work_dir: str) -> str:
    """
    Extract text from a local HTML document using CSS selectors.

    Each entry in ``selectors`` maps an output key to a CSS selector; the
    result is an indented JSON object mapping every key to the list of
    stripped text contents of all elements matching its selector.
    """
    print(f"Executing Tool 'parse_html' for file: {file_path}")
    full_path = Path(work_dir) / file_path
    if not full_path.exists():
        return f"Error: HTML file not found at {full_path}"

    try:
        markup = full_path.read_text(encoding="utf-8")
        soup = BeautifulSoup(markup, "lxml")

        # One selector query per requested key; selectors with no matches
        # simply yield an empty list.
        results = {
            key: [node.get_text(strip=True) for node in soup.select(css)]
            for key, css in selectors.items()
        }
        return json.dumps(results, indent=2)

    except Exception as e:
        return f"Failed to parse HTML file {file_path}. Error: {e}"
378
+
379
+
380
async def scrape_dynamic_site(url: str, work_dir: str) -> str:
    """
    Render a JavaScript-driven page in headless Chromium and persist the
    fully rendered HTML as 'scraped_page.html' inside ``work_dir``.

    Returns a small JSON status object on success, or an error string.
    """
    print(f"Executing Tool 'scrape_dynamic_site' for url: {url}")
    save_path = Path(work_dir) / "scraped_page.html"

    try:
        async with async_playwright() as p:
            browser = await p.chromium.launch()
            page = await browser.new_page()
            # Wait for network activity to settle so client-side rendering
            # has finished before the DOM is captured.
            await page.goto(url, wait_until='networkidle', timeout=30000)
            rendered_html = await page.content()
            await browser.close()

        # Persist the fully rendered HTML for downstream parsing tools.
        save_path.write_text(rendered_html, encoding="utf-8")

        # Report only the filename so callers stay work_dir-relative.
        return json.dumps({
            "status": "success",
            "url": url,
            "saved_to": str(save_path.name),
        })

    except Exception as e:
        return f"Failed to scrape dynamic site {url}. Error: {e}"
409
+
410
+
411
def geocode_address(address: str) -> str:
    """
    Resolve a street address or place name to latitude/longitude using
    the Nominatim (OpenStreetMap) geocoder.

    Returns an indented JSON object on success, or an error string when
    the address cannot be resolved.
    """
    print(f"Executing Tool 'geocode_address' for address: {address}")
    try:
        # Nominatim's usage policy asks for a distinctive user agent.
        match = Nominatim(user_agent="data_analyst_agent_v1").geocode(address)

        if match is None:
            return f"Error: Could not find coordinates for the address '{address}'."

        return json.dumps(
            {
                "address": address,
                "latitude": match.latitude,
                "longitude": match.longitude,
                "full_address_found": match.address,
            },
            indent=2,
        )

    except Exception as e:
        return f"Failed to geocode address. Error: {e}"
436
+
437
+
438
+
439
def analyze_image_content(image_path: str, prompt: str, work_dir: str, client: openai.Client) -> str:
    """
    Analyzes the content of an image file using a multimodal LLM and answers a question about it.

    Args:
        image_path: Path to the image, relative to ``work_dir``.
        prompt: The question to ask about the image.
        work_dir: Base directory containing the image file.
        client: An OpenAI-compatible client used for the multimodal call.

    Returns:
        A JSON string with the image path and the model's analysis, or an
        error string on failure.
    """
    print(f"Executing Tool 'analyze_image_content' for file: {image_path}")
    full_path = Path(work_dir) / image_path
    if not full_path.exists():
        return f"Error: Image file not found at {full_path}"

    try:
        # Validate the file is a real image and capture its actual format so
        # the data URI advertises the correct MIME type (previously the URI
        # hardcoded image/jpeg even for PNGs). The context manager ensures
        # the file handle opened by PIL is released.
        with Image.open(full_path) as img:
            image_format = (img.format or "JPEG").lower()
        mime_type = "image/jpeg" if image_format in ("jpg", "jpeg") else f"image/{image_format}"

        # Encode the image to base64 for inline transmission.
        with open(full_path, "rb") as image_file:
            base64_image = base64.b64encode(image_file.read()).decode('utf-8')

        # Call the multimodal model with the text prompt and inline image.
        response = client.chat.completions.create(
            model="openai/gpt-4.1-nano",
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": prompt},
                        {"type": "image_url", "image_url": {"url": f"data:{mime_type};base64,{base64_image}"}},
                    ],
                }
            ],
            max_tokens=500,  # Allow for a reasonably detailed description
        )
        description = response.choices[0].message.content
        return json.dumps({"image": image_path, "analysis": description})

    except Exception as e:
        return f"Failed to analyze image. Error: {e}"
475
+
476
+
477
def scrape_wikipedia_summary(query: str) -> str:
    """
    Look up ``query`` on Wikipedia and return the page's lead summary as
    an indented JSON object.

    Missing-page and disambiguation conditions are reported as error
    strings rather than raised.
    """
    print(f"Executing Tool 'scrape_wikipedia_summary' for query: {query}")
    try:
        # auto_suggest lets the library correct near-miss queries.
        page_summary = wikipedia.summary(query, auto_suggest=True)
        return json.dumps({"query": query, "summary": page_summary}, indent=2)

    except wikipedia.exceptions.PageError:
        return f"Error: Could not find a Wikipedia page for the query '{query}'."
    except wikipedia.exceptions.DisambiguationError as e:
        return f"Error: The query '{query}' is ambiguous. It could refer to any of the following: {e.options}"
    except Exception as e:
        return f"Failed to scrape Wikipedia. Error: {e}"
498
+
499
+
500
def scrape_pdf_tables(file_path: str, work_dir: str) -> str:
    """
    Extract every table found across all pages of a PDF file.

    Each detected table is serialized with pandas' 'split' JSON layout
    and the set is bundled under 'extracted_tables'. Requires a Java
    runtime, since tabula-py shells out to Tabula.
    """
    print(f"Executing Tool 'scrape_pdf_tables' for file: {file_path}")
    full_path = Path(work_dir) / file_path
    if not full_path.exists():
        return f"Error: PDF file not found at {full_path}"

    try:
        # tabula returns one DataFrame per detected table.
        frames = tabula.read_pdf(full_path, pages='all', multiple_tables=True)

        if not frames:
            return "No tables were found in the PDF file."

        serialized = [frame.to_json(orient='split') for frame in frames]
        return json.dumps({"file_name": file_path, "extracted_tables": serialized})

    except Exception as e:
        return f"Failed to scrape tables from PDF. Make sure Java is installed on the system. Error: {e}"
524
+
525
+
526
def get_sentiment(text_to_analyze: str, client: openai.Client) -> str:
    """
    Classify a piece of text as positive, negative, or neutral.

    Delegates to a small LLM constrained to answer with a single word,
    validates the answer, and returns it as a JSON object.
    """
    print(f"Executing Tool 'get_sentiment'")

    try:
        # A tightly constrained system prompt turns the chat model into a
        # three-way classifier.
        completion = client.chat.completions.create(
            model="openai/gpt-5-nano",  # Use a fast and cheap model for this simple task
            messages=[
                {"role": "system", "content": "You are a sentiment analysis tool. Classify the user's text as 'positive', 'negative', or 'neutral'. Respond with only one of these three words and nothing else."},
                {"role": "user", "content": text_to_analyze}
            ],
            max_tokens=5,  # Limit the output to a single word
            temperature=0.0  # Make the output deterministic
        )
        label = completion.choices[0].message.content.lower().strip()

        # Reject anything outside the allowed label set.
        if label not in ["positive", "negative", "neutral"]:
            return "Error: Could not determine a valid sentiment."

        return json.dumps({"text": text_to_analyze, "sentiment": label})

    except Exception as e:
        return f"Failed to get sentiment. Error: {e}"
553
+
554
+
555
def run_sql_query(query: str, db_connection_string: str) -> str:
    """
    Run a SQL query against a database and return the rows as JSON records.

    Accepts any SQLAlchemy-compatible connection string; file-backed
    engines such as SQLite ('sqlite:///path/to/database.db') and DuckDB
    are the expected case. Database file paths should be relative to the
    agent's working directory.
    """
    print(f"Executing Tool 'run_sql_query'")

    try:
        engine = create_engine(db_connection_string)

        # Let pandas drive execution so results come back as a DataFrame.
        with engine.connect() as conn:
            rows = pd.read_sql_query(sql=text(query), con=conn)

        return rows.to_json(orient="records")

    except Exception as e:
        return f"Failed to execute SQL query. Error: {e}"
577
+
578
+
579
def calculate_correlation(file_path: str, column1: str, column2: str, work_dir: str) -> str:
    """
    Compute the Pearson correlation between two columns of a CSV or
    Parquet file and return it as an indented JSON object.
    """
    print(f"Executing Tool 'calculate_correlation' for file: {file_path}")
    full_path = Path(work_dir) / file_path
    if not full_path.exists():
        return f"Error: Data file not found at {full_path}"

    try:
        lowered = file_path.lower()
        if lowered.endswith('.csv'):
            frame = pd.read_csv(full_path)
        elif lowered.endswith('.parquet'):
            frame = pd.read_parquet(full_path)
        else:
            return "Error: Unsupported file type."

        # Both columns must be present before computing anything.
        if any(col not in frame.columns for col in (column1, column2)):
            return f"Error: One or both columns ('{column1}', '{column2}') not found in the file."

        return json.dumps(
            {
                "file_name": file_path,
                "column_1": column1,
                "column_2": column2,
                "pearson_correlation": frame[column1].corr(frame[column2]),
            },
            indent=2,
        )

    except Exception as e:
        return f"Failed to calculate correlation. Error: {e}"
614
+
615
def create_pivot_table(file_path: str, index: str, columns: str, values: str, work_dir: str) -> str:
    """
    Creates a pivot table from the data in the specified file.

    Args:
        file_path: CSV or Parquet file, relative to ``work_dir``.
        index: Column whose values become the pivot table rows.
        columns: Column whose values become the pivot table columns.
        values: Column aggregated (summed) into each cell.
        work_dir: Base directory containing the data file.

    Returns:
        The pivot table in pandas 'split' JSON format, or an error string.
    """
    print(f"Executing Tool 'create_pivot_table' for file: {file_path}")
    full_path = Path(work_dir) / file_path
    if not full_path.exists():
        return f"Error: Data file not found at {full_path}"

    try:
        if file_path.lower().endswith('.csv'):
            df = pd.read_csv(full_path)
        elif file_path.lower().endswith('.parquet'):
            df = pd.read_parquet(full_path)
        else:
            return f"Error: Unsupported file type."

        # Use the string alias "sum" for the aggregator: passing np.sum as a
        # callable is deprecated in pandas 2.x (emits a FutureWarning) and
        # needlessly couples this tool to numpy.
        pivot_table = pd.pivot_table(df, values=values, index=index, columns=columns, aggfunc="sum")

        # Return the pivot table as a JSON string
        return pivot_table.to_json(orient="split")

    except Exception as e:
        return f"Failed to create pivot table. Error: {e}"
640
+
641
def get_dataframe_info(file_path: str, work_dir: str) -> str:
    """
    Summarize a CSV or Parquet file for the agent.

    Bundles the output of ``DataFrame.info`` (column names, dtypes,
    non-null counts) together with ``describe(include='all')`` statistics
    into one indented JSON document.
    """
    print(f"Executing Tool 'get_dataframe_info' for file: {file_path}")
    full_path = Path(work_dir) / file_path
    if not full_path.exists():
        return f"Error: Data file not found at {full_path}"

    try:
        lowered = file_path.lower()
        if lowered.endswith('.csv'):
            frame = pd.read_csv(full_path)
        elif lowered.endswith('.parquet'):
            frame = pd.read_parquet(full_path)
        else:
            return f"Error: Unsupported file type. Only .csv and .parquet are supported."

        # df.info() prints rather than returns, so capture it via a buffer.
        buffer = io.StringIO()
        frame.info(buf=buffer)

        return json.dumps(
            {
                "file_name": file_path,
                "info": buffer.getvalue(),
                "statistical_summary": frame.describe(include='all').to_json(orient="split"),
            },
            indent=2,
        )

    except Exception as e:
        return f"Failed to get DataFrame info. Error: {e}"
678
+
679
def fetch_url(url: str) -> str:
    """Fetches text content from a specified URL using the AI Pipe proxy."""
    print(f"Executing Tool 'fetch_url' with URL: {url}")
    try:
        # All traffic is routed through the AI Pipe proxy endpoint.
        resp = requests.get(f"https://aipipe.org/proxy/{url}", timeout=30)
        resp.raise_for_status()
        return resp.text
    except requests.RequestException as e:
        return f"Error: Failed to fetch URL {url}. Reason: {e}"
689
+
690
def python_interpreter(code: str, work_dir: str) -> str:
    """
    Executes Python code in a sandboxed subprocess within a specific working directory.

    The code can access any files within its `work_dir`.
    If the code generates 'output.png', it will be base64 encoded and returned.

    Args:
        code: Python source to run.
        work_dir: Directory the script is written to and executed from.

    Returns:
        The script's stdout on success (prefixed with a base64 data URI when
        'output.png' was produced), or an error description string.
    """
    print(f"Executing Tool 'python_interpreter' in directory: {work_dir}")
    work_path = Path(work_dir)
    script_path = work_path / "agent_script.py"
    plot_path = work_path / "output.png"

    # Persist the code so the subprocess can run it from inside work_dir.
    # Explicit UTF-8 avoids platform-default-encoding failures.
    with open(script_path, "w", encoding="utf-8") as f:
        f.write(code)
    print("\n\n--- 📜 DECODING: SCRIPT TO EXECUTE 📜 ---")
    print(code)
    print("------------------------------------------\n")

    try:
        # Run with the same interpreter executing this process so the script
        # sees the same installed packages. (The previous version assigned
        # sys.executable twice; with check=False, subprocess.run never raises
        # CalledProcessError, so that dead except branch is removed.)
        process = subprocess.run(
            [sys.executable, str(script_path)],
            cwd=work_path,  # Run the script from within the temp directory
            capture_output=True,
            text=True,
            timeout=1000,
            check=False,
        )
        print("\n\n--- 📤 DECODING: SCRIPT RAW OUTPUT 📤 ---")
        print(f"Return Code: {process.returncode}")
        print("--- STDOUT ---")
        print(process.stdout)
        print("--- STDERR ---")
        print(process.stderr)
        print("------------------------------------------\n")

        if process.returncode != 0:
            return f"SCRIPT FAILED with return code {process.returncode}:\nSTDOUT:\n{process.stdout}\nSTDERR:\n{process.stderr}"

        # Check if a plot was generated as output.png
        if plot_path.exists():
            with open(plot_path, "rb") as img_file:
                img_base64 = base64.b64encode(img_file.read()).decode('utf-8')
            # Prepend the plot's data URI to the stdout
            plot_uri = f"data:image/png;base64,{img_base64}"
            return f"image_output:\n{plot_uri}\n\ntext_output:\n{process.stdout}"

        # If successful, just return the standard output
        return process.stdout

    except subprocess.TimeoutExpired:
        return "Error: The Python script took too long to execute."
    except Exception as e:
        return f"An unexpected error occurred: {e}"
756
+
757
+
758
+