# Source: Hugging Face Space file, commit f757484 (5.94 kB)
# Commit message: "Final attempt: Use minimalist, aggressive prompt"
# app.py
#
# Data Analyst Agent API: a FastAPI service that scrapes a URL, extracts a
# table, asks an LLM to generate pandas analysis code, and executes it.
from fastapi import FastAPI, File, UploadFile, Form
from typing import List
import openai
import json
import pandas as pd
# Import our agent's tools
import tools
# Initialize FastAPI app
app = FastAPI()
# Initialize the OpenAI client.
# It will automatically pick up credentials from Hugging Face Secrets.
client = openai.OpenAI()
# Give the tools module access to the initialized OpenAI client
# (tools makes its own LLM calls, e.g. for table selection).
tools.set_openai_client(client)
@app.get("/")
async def read_root():
    """A simple root endpoint to confirm the API is running.

    Returns:
        dict: A static status message.
    """
    return {"message": "Data Analyst Agent API is running!"}
@app.post("/api/")
async def analyze_data(
    questions_file: UploadFile = File(..., alias="questions.txt"),
    files: List[UploadFile] = File([], alias="files"),
):
    """
    Main endpoint to handle data analysis tasks. It orchestrates scraping,
    data extraction, code generation, and code execution.

    Args:
        questions_file: Uploaded text file containing the user's questions
            (and, for scraping tasks, the target URL).
        files: Optional additional uploads (currently unused by the workflow).

    Returns:
        On success, a JSON array of answer strings (one per printed line of
        the executed script). On failure, a dict with an "error" key; for
        non-scraping tasks, a dict with a "response" key.
    """
    questions_text = (await questions_file.read()).decode("utf-8")

    # Simple router: only tasks mentioning both "scrape" and a URL go through
    # the agent workflow; everything else is answered by the fallback below.
    lowered = questions_text.lower()
    if "scrape" not in lowered or "http" not in lowered:
        # Handle non-scraping, general knowledge tasks.
        return {"response": "This is a non-scraping task."}

    # --- AGENT WORKFLOW ---
    # Step 1: PERCEIVE - get the fully rendered HTML from the URL (Playwright
    # inside tools, so JS-populated tables are visible).
    print("Step 1: Fetching dynamic HTML from URL...")
    url = next((word for word in questions_text.split() if word.startswith("http")), None)
    if not url:
        return {"error": "Scraping task detected, but no URL was found."}
    html_content = await tools.get_dynamic_html(url)
    # NOTE(review): substring check is fragile — any page whose HTML happens
    # to contain "Error" is misclassified; tools should signal failure
    # explicitly (e.g. raise or return a sentinel). Preserved as-is.
    if isinstance(html_content, str) and "Error" in html_content:
        return {"error": html_content}

    # Step 2: DECIDE - ask the LLM which table index best fits the task.
    print("Step 2: Asking LLM to choose the best table index...")
    choice_json_str = tools.choose_best_table_from_html(html_content, questions_text)
    try:
        choice = json.loads(choice_json_str)
        if "error" in choice:
            return {"error": choice["error"]}
        table_index = choice.get("index")
        # isinstance(None, int) is False, so this also rejects a missing key.
        if not isinstance(table_index, int):
            return {"error": "LLM failed to return a valid integer index for the table."}
    except (json.JSONDecodeError, TypeError):
        return {"error": f"Failed to decode LLM response for table choice: {choice_json_str}"}

    # Step 3: ACT (extraction) - pull the chosen table into a pandas DataFrame.
    print(f"Step 3: Extracting table with index '{table_index}'...")
    df = tools.extract_table_to_dataframe(html_content, table_index)
    if isinstance(df, str):
        # The tool returns an error-message string instead of a DataFrame.
        return {"error": df}

    # --- STEP 4: GENERATE & EXECUTE PYTHON CODE ---
    print("Step 4: Generating Python code for analysis...")
    # Concise DataFrame summary so the prompt stays small.
    df_head = df.head().to_string()
    df_info = f"Here is the head of the pandas DataFrame, named 'df':\n{df_head}"
    system_prompt = """
You are a Python script generator. Your only output is code.
A pandas DataFrame named `df` and the following libraries are pre-loaded: `pd`, `re`, `plt`, `sns`, `np`, `io`, `base64`, `LinearRegression`.
**CRITICAL:**
- DO NOT import any libraries.
- DO NOT load any data.
- Write a script that cleans the `df` DataFrame and then prints the answers to the user's questions.
- For plots, print a base64 data URI using the provided recipe.
"""
    user_prompt = f"{df_info}\n\nPlease write a Python script to answer the following questions:\n\n{questions_text}"
    try:
        # Generate the analysis script with the LLM.
        completion = client.chat.completions.create(
            model="gpt-5-nano",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt},
            ],
        )
        response_content = completion.choices[0].message.content
        # Strip markdown code fences (e.g. ```python\n...\n```), if present.
        python_code = response_content.strip().replace("```python", "").replace("```", "").strip()
        # Step 5: ACT (execution) - run the generated code using our tool.
        print(f"--- Generated Python Code ---\n{python_code}\n-----------------------------")
        print("Step 5: Executing generated code.")
        execution_result = tools.run_python_code_on_dataframe(df, python_code)
        # The result is the captured print output; format it as a JSON array
        # of non-empty answer lines.
        final_answers = [line for line in execution_result.strip().split('\n') if line.strip()]
        return final_answers
    except Exception as e:
        # Boundary handler: surface generation/execution failures as JSON
        # rather than a 500, so the client always gets a structured reply.
        return {"error": f"An error occurred during code generation or execution: {str(e)}"}