KarthikMuraliM committed on
Commit
8bcc812
·
1 Parent(s): 9f768b8

Feat: Add Code Interpreter for reliable data analysis

Browse files
Files changed (2) hide show
  1. app.py +75 -34
  2. tools.py +32 -2
app.py CHANGED
@@ -5,15 +5,22 @@ import openai
5
  import json
6
  import pandas as pd
7
 
8
- # Import our new set of tools
9
  import tools
10
 
 
11
  app = FastAPI()
 
 
 
12
  client = openai.OpenAI()
13
- tools.set_openai_client(client) # Give the tools module access to the client
 
 
14
 
15
  @app.get("/")
16
  async def read_root():
 
17
  return {"message": "Data Analyst Agent API is running!"}
18
 
19
  @app.post("/api/")
@@ -21,59 +28,93 @@ async def analyze_data(
21
  questions_file: UploadFile = File(..., alias="questions.txt"),
22
  files: List[UploadFile] = File([], alias="files"),
23
  ):
 
 
 
 
24
  questions_text = (await questions_file.read()).decode("utf-8")
25
 
 
26
  if "scrape" in questions_text.lower() and "http" in questions_text.lower():
 
 
 
 
 
27
  url = next((word for word in questions_text.split() if word.startswith("http")), None)
28
  if not url:
29
  return {"error": "Scraping task detected, but no URL was found."}
30
-
31
- # --- AGENT WORKFLOW ---
32
- # 1. PERCEIVE
33
- print(f"Step 1: Fetching dynamic HTML from {url}")
34
  html_content = await tools.get_dynamic_html(url)
35
- if "Error" in html_content:
36
  return {"error": html_content}
37
 
38
- # 2. DECIDE
39
- print("Step 2: Asking LLM to choose the best table index.")
40
- task_description = f"Find a table with the following information: {questions_text}"
41
- choice_json_str = tools.choose_best_table_from_html(html_content, task_description)
42
-
43
  try:
44
  choice = json.loads(choice_json_str)
45
  if "error" in choice:
46
  return {"error": choice["error"]}
47
- table_index = choice.get("index") # Get the index from the LLM response
48
  if table_index is None or not isinstance(table_index, int):
49
  return {"error": "LLM failed to return a valid integer index for the table."}
50
- except json.JSONDecodeError:
51
  return {"error": f"Failed to decode LLM response for table choice: {choice_json_str}"}
52
 
53
- # 3. ACT
54
- print(f"Step 3: Extracting table with index '{table_index}'.")
55
- df_or_error = tools.extract_table_to_dataframe(html_content, table_index) # Use the index
56
- if isinstance(df_or_error, str):
57
- return {"error": df_or_error}
58
-
59
- # 4. ANALYSIS (This part is unchanged)
60
- print("Step 4: Analyzing data with LLM.")
61
- data_string = df_or_error.to_csv(index=False)
62
- if len(data_string) > 15000:
63
- data_string = df_or_error.head(50).to_csv(index=False)
64
-
 
65
  system_prompt = """
66
- You are an expert data analyst agent... Respond with a JSON object: {\"answers\": [...]}
 
 
 
 
 
 
 
 
 
67
  """
68
- user_prompt = f"Data:\n{data_string}\n\nQuestions:\n{questions_text}"
69
 
70
  try:
71
- completion = client.chat.completions.create(model="gpt-4o", response_format={"type": "json_object"}, messages=[{"role": "system", "content": system_prompt}, {"role": "user", "content": user_prompt}])
72
- response_data = json.loads(completion.choices[0].message.content)
73
- return response_data.get("answers", {"error": "LLM did not return answers in the expected format."})
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
74
  except Exception as e:
75
- return {"error": f"Error during final analysis: {str(e)}"}
76
 
77
  else:
78
- # Handle non-scraping tasks here
79
- return {"response": "This is a non-scraping task."}
 
5
  import json
6
  import pandas as pd
7
 
8
+ # Import our agent's tools
9
  import tools
10
 
11
+ # Initialize FastAPI app
12
  app = FastAPI()
13
+
14
+ # Initialize the OpenAI client.
15
+ # It will automatically pick up credentials from Hugging Face Secrets.
16
  client = openai.OpenAI()
17
+
18
+ # Give the tools module access to the initialized OpenAI client
19
+ tools.set_openai_client(client)
20
 
21
  @app.get("/")
22
  async def read_root():
23
+ """A simple root endpoint to confirm the API is running."""
24
  return {"message": "Data Analyst Agent API is running!"}
25
 
26
  @app.post("/api/")
 
28
  questions_file: UploadFile = File(..., alias="questions.txt"),
29
  files: List[UploadFile] = File([], alias="files"),
30
  ):
31
+ """
32
+ Main endpoint to handle data analysis tasks. It orchestrates scraping,
33
+ data extraction, code generation, and code execution.
34
+ """
35
  questions_text = (await questions_file.read()).decode("utf-8")
36
 
37
+ # Simple router: Check if the task involves scraping a URL.
38
  if "scrape" in questions_text.lower() and "http" in questions_text.lower():
39
+
40
+ # --- AGENT WORKFLOW ---
41
+
42
+ # Step 1: PERCEIVE - Get the fully rendered HTML from the URL using Playwright
43
+ print("Step 1: Fetching dynamic HTML from URL...")
44
  url = next((word for word in questions_text.split() if word.startswith("http")), None)
45
  if not url:
46
  return {"error": "Scraping task detected, but no URL was found."}
47
+
 
 
 
48
  html_content = await tools.get_dynamic_html(url)
49
+ if isinstance(html_content, str) and "Error" in html_content:
50
  return {"error": html_content}
51
 
52
+ # Step 2: DECIDE - Ask the LLM to identify the best table to use for the task
53
+ print("Step 2: Asking LLM to choose the best table index...")
54
+ choice_json_str = tools.choose_best_table_from_html(html_content, questions_text)
 
 
55
  try:
56
  choice = json.loads(choice_json_str)
57
  if "error" in choice:
58
  return {"error": choice["error"]}
59
+ table_index = choice.get("index")
60
  if table_index is None or not isinstance(table_index, int):
61
  return {"error": "LLM failed to return a valid integer index for the table."}
62
+ except (json.JSONDecodeError, TypeError):
63
  return {"error": f"Failed to decode LLM response for table choice: {choice_json_str}"}
64
 
65
+ # Step 3: ACT (Extraction) - Extract the chosen table into a pandas DataFrame
66
+ print(f"Step 3: Extracting table with index '{table_index}'...")
67
+ df = tools.extract_table_to_dataframe(html_content, table_index)
68
+ if isinstance(df, str):
69
+ return {"error": df}
70
+
71
+ # --- STEP 4: GENERATE & EXECUTE PYTHON CODE ---
72
+ print("Step 4: Generating Python code for analysis...")
73
+
74
+ # Prepare a concise summary of the DataFrame for the LLM prompt
75
+ df_head = df.head().to_string()
76
+ df_info = f"Here is the head of the pandas DataFrame, named 'df':\n{df_head}"
77
+
78
  system_prompt = """
79
+ You are an expert Python data analyst. You are given a description of a pandas DataFrame named 'df' and a set of questions.
80
+ Your task is to write a single Python script to answer these questions.
81
+
82
+ Guidelines:
83
+ 1. The DataFrame 'df' is already loaded and available in your environment.
84
+ 2. First, you MUST perform data cleaning. Pay close attention to columns with symbols like '$', ',', or text that needs to be converted to numbers. Use `pd.to_numeric` and string manipulation (`.str.replace()`). Handle potential errors during conversion by using `errors='coerce'`.
85
+ 3. Address each question from the user clearly.
86
+ 4. Use the `print()` function to output the final answer for each question. Start each print statement with a clear label (e.g., "Answer 1:", "Answer 2:").
87
+ 5. Do not include any example usage, comments, or explanations outside of the Python code block.
88
+ 6. The final output of your script should be ONLY the Python code itself.
89
  """
90
+ user_prompt = f"{df_info}\n\nPlease write a Python script to answer the following questions:\n\n{questions_text}"
91
 
92
  try:
93
+ # Generate the Python code using the LLM
94
+ completion = client.chat.completions.create(
95
+ model="gpt-5-nano",
96
+ messages=[
97
+ {"role": "system", "content": system_prompt},
98
+ {"role": "user", "content": user_prompt}
99
+ ]
100
+ )
101
+ response_content = completion.choices[0].message.content
102
+
103
+ # Extract the code from the markdown block (e.g., ```python\n...\n```)
104
+ python_code = response_content.strip().replace("```python", "").replace("```", "").strip()
105
+
106
+ # Step 5: ACT (Execution) - Run the generated code using our tool
107
+ print("Step 5: Executing generated code.")
108
+ execution_result = tools.run_python_code_on_dataframe(df, python_code)
109
+
110
+ # The result is the captured print output. Format it into a JSON array of strings.
111
+ final_answers = [line for line in execution_result.strip().split('\n') if line.strip()]
112
+
113
+ return final_answers
114
+
115
  except Exception as e:
116
+ return {"error": f"An error occurred during code generation or execution: {str(e)}"}
117
 
118
  else:
119
+ # Handle non-scraping, general knowledge tasks
120
+ return {"response": "This is a non-scraping task."}
tools.py CHANGED
@@ -5,6 +5,9 @@ from bs4 import BeautifulSoup
5
  import json
6
  import openai
7
 
 
 
 
8
  client = None
9
  def set_openai_client(c):
10
  global client
@@ -57,7 +60,7 @@ def choose_best_table_from_html(html_content: str, task_description: str) -> str
57
 
58
  try:
59
  completion = client.chat.completions.create(
60
- model="gpt-4o",
61
  response_format={"type": "json_object"},
62
  messages=[{"role": "system", "content": system_prompt}, {"role": "user", "content": user_prompt}]
63
  )
@@ -81,4 +84,31 @@ def extract_table_to_dataframe(html_content: str, table_index: int) -> (pd.DataF
81
  return "Error: Pandas could not parse the selected table."
82
  return df_list[0]
83
  except Exception as e:
84
- return f"Error converting table to DataFrame: {e}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
  import json
6
  import openai
7
 
8
+ import io
9
+ import sys
10
+ from contextlib import redirect_stdout
11
  client = None
12
  def set_openai_client(c):
13
  global client
 
60
 
61
  try:
62
  completion = client.chat.completions.create(
63
+ model="gpt-5-nano",
64
  response_format={"type": "json_object"},
65
  messages=[{"role": "system", "content": system_prompt}, {"role": "user", "content": user_prompt}]
66
  )
 
84
  return "Error: Pandas could not parse the selected table."
85
  return df_list[0]
86
  except Exception as e:
87
+ return f"Error converting table to DataFrame: {e}"
88
+
89
+
90
def run_python_code_on_dataframe(df: pd.DataFrame, python_code: str) -> str:
    """
    Execute a Python script with a DataFrame bound to the name 'df' and
    return whatever the script printed to stdout.

    Args:
        df: DataFrame exposed to the script under the name ``df``. The script
            receives the same object, so in-place edits made by the script
            are visible to the caller afterwards.
        python_code: Source text of the script to run.

    Returns:
        The captured stdout text if the script printed anything; a fixed
        notice string if it printed nothing; or, if the script raised an
        ``Exception``, a string starting with "Error executing code:" that
        contains the exception message followed by the script itself.
        Exceptions are reported through the return value, not raised.
    """
    # SECURITY: exec() runs the script with the full privileges of this
    # process (file system, network, environment variables). If python_code
    # can be influenced by untrusted input (e.g. model output driven by a
    # user prompt), run it in an isolated sandbox/subprocess instead.

    # Buffer that receives everything the script prints.
    output_stream = io.StringIO()

    # A single dict is used as BOTH globals and locals. With separate dicts,
    # top-level names (including 'df') live in the locals mapping, which is
    # invisible from inside functions the script defines, so helper functions
    # referencing 'df' fail with NameError. 'pd' is pre-populated as well so
    # scripts can call pandas helpers without importing pandas themselves.
    namespace = {'df': df, 'pd': pd}

    try:
        # Only the exec call sits in the try body: it is the only statement
        # expected to raise. exec() adds '__builtins__' to the namespace
        # automatically when the key is absent.
        with redirect_stdout(output_stream):
            exec(python_code, namespace)
    except Exception as e:
        return f"Error executing code: {e}\n---\nCode that failed:\n{python_code}"

    result = output_stream.getvalue()
    if not result:
        return "Code executed successfully with no printed output."
    return result