KarthikMuraliM committed on
Commit
a523805
·
1 Parent(s): cfd667b

Implement and debug dynamic, agentic scraping with Playwright

Browse files
Files changed (2) hide show
  1. app.py +14 -7
  2. tools.py +12 -12
app.py CHANGED
@@ -29,13 +29,14 @@ async def analyze_data(
29
  return {"error": "Scraping task detected, but no URL was found."}
30
 
31
  # --- AGENT WORKFLOW ---
32
- # 1. PERCEIVE: Get the full page content
33
  print(f"Step 1: Fetching dynamic HTML from {url}")
34
- html_content = tools.get_dynamic_html(url)
 
35
  if "Error" in html_content:
36
  return {"error": html_content}
37
 
38
- # 2. DECIDE: Ask LLM to choose the best table for the task
39
  print("Step 2: Asking LLM to choose the best table.")
40
  task_description = f"Find a table with the following information: {questions_text}"
41
  choice_json_str = tools.choose_best_table_from_html(html_content, task_description)
@@ -50,23 +51,29 @@ async def analyze_data(
50
  except json.JSONDecodeError:
51
  return {"error": f"Failed to decode LLM response for table choice: {choice_json_str}"}
52
 
53
- # 3. ACT: Extract the chosen table into a DataFrame
54
  print(f"Step 3: Extracting table with selector '{selector}'.")
55
  df_or_error = tools.extract_table_to_dataframe(html_content, selector)
56
  if isinstance(df_or_error, str):
57
  return {"error": df_or_error}
58
 
59
- # --- ANALYSIS (same as before) ---
60
  print("Step 4: Analyzing data with LLM.")
61
  data_string = df_or_error.to_csv(index=False)
62
  if len(data_string) > 15000:
63
  data_string = df_or_error.head(50).to_csv(index=False)
64
 
65
- system_prompt = "You are an expert data analyst... respond with a JSON object: {\"answers\": [...]}" # (Same prompt as before)
 
 
 
 
 
 
66
  user_prompt = f"Data:\n{data_string}\n\nQuestions:\n{questions_text}"
67
 
68
  try:
69
- completion = client.chat.completions.create(model="gpt-4o", response_format={"type": "json_object"}, messages=[{"role": "system", "content": system_prompt}, {"role": "user", "content": user_prompt}])
70
  response_data = json.loads(completion.choices[0].message.content)
71
  return response_data.get("answers", {"error": "LLM did not return answers in the expected format."})
72
  except Exception as e:
 
29
  return {"error": "Scraping task detected, but no URL was found."}
30
 
31
  # --- AGENT WORKFLOW ---
32
+ # 1. PERCEIVE: Await the async function
33
  print(f"Step 1: Fetching dynamic HTML from {url}")
34
+ # get_dynamic_html is a coroutine (async def), so its result must be awaited
35
+ html_content = await tools.get_dynamic_html(url)
36
  if "Error" in html_content:
37
  return {"error": html_content}
38
 
39
+ # 2. DECIDE: This tool is synchronous, so no await is needed
40
  print("Step 2: Asking LLM to choose the best table.")
41
  task_description = f"Find a table with the following information: {questions_text}"
42
  choice_json_str = tools.choose_best_table_from_html(html_content, task_description)
 
51
  except json.JSONDecodeError:
52
  return {"error": f"Failed to decode LLM response for table choice: {choice_json_str}"}
53
 
54
+ # 3. ACT: This tool is synchronous
55
  print(f"Step 3: Extracting table with selector '{selector}'.")
56
  df_or_error = tools.extract_table_to_dataframe(html_content, selector)
57
  if isinstance(df_or_error, str):
58
  return {"error": df_or_error}
59
 
60
+ # 4. ANALYSIS: The OpenAI call below is not awaited (presumably a synchronous client - confirm), so it blocks the event loop while it runs
61
  print("Step 4: Analyzing data with LLM.")
62
  data_string = df_or_error.to_csv(index=False)
63
  if len(data_string) > 15000:
64
  data_string = df_or_error.head(50).to_csv(index=False)
65
 
66
+ system_prompt = """
67
+ You are an expert data analyst agent. You will be given a dataset in CSV format and a list of questions about it.
68
+ Your task is to answer the questions based *only* on the provided data.
69
+ Perform necessary data cleaning and type conversions mentally before answering (e.g., handle '$' signs, commas in numbers, dates).
70
+ Provide your final answers as a single JSON object, with a key named "answers" which contains a list of strings. Each string in the list should be the answer to one of the user's questions, in order.
71
+ For example: {"answers": ["Answer to Q1", "Answer to Q2"]}
72
+ """
73
  user_prompt = f"Data:\n{data_string}\n\nQuestions:\n{questions_text}"
74
 
75
  try:
76
+ completion = client.chat.completions.create(model="gpt-5-nano", response_format={"type": "json_object"}, messages=[{"role": "system", "content": system_prompt}, {"role": "user", "content": user_prompt}])
77
  response_data = json.loads(completion.choices[0].message.content)
78
  return response_data.get("answers", {"error": "LLM did not return answers in the expected format."})
79
  except Exception as e:
tools.py CHANGED
@@ -1,6 +1,6 @@
1
  # tools.py
2
  import pandas as pd
3
- from playwright.sync_api import sync_playwright
4
  from bs4 import BeautifulSoup
5
  import json
6
  import openai
@@ -12,19 +12,19 @@ def set_openai_client(c):
12
  global client
13
  client = c
14
 
15
- def get_dynamic_html(url: str) -> str:
16
- """Fetches the fully rendered HTML of a page using Playwright."""
17
- with sync_playwright() as p:
18
- browser = p.chromium.launch()
19
- page = browser.new_page()
 
20
  try:
21
- # Use networkidle to wait for most dynamic content to load
22
- page.goto(url, timeout=20000, wait_until='networkidle')
23
- html_content = page.content()
24
  except Exception as e:
25
- browser.close()
26
  return f"Error fetching page with Playwright: {e}"
27
- browser.close()
28
  return html_content
29
 
30
  def choose_best_table_from_html(html_content: str, task_description: str) -> str:
@@ -65,7 +65,7 @@ def choose_best_table_from_html(html_content: str, task_description: str) -> str
65
 
66
  try:
67
  completion = client.chat.completions.create(
68
- model="gpt-4o",
69
  response_format={"type": "json_object"},
70
  messages=[
71
  {"role": "system", "content": system_prompt},
 
1
  # tools.py
2
  import pandas as pd
3
+ from playwright.async_api import async_playwright
4
  from bs4 import BeautifulSoup
5
  import json
6
  import openai
 
12
  global client
13
  client = c
14
 
15
+ async def get_dynamic_html(url: str) -> str:
16
+ """Fetches the fully rendered HTML of a page using Playwright's ASYNC API."""
17
+ # 'async with' is the asynchronous context manager
18
+ async with async_playwright() as p:
19
+ browser = await p.chromium.launch()
20
+ page = await browser.new_page()
21
  try:
22
+ await page.goto(url, timeout=20000, wait_until='networkidle')
23
+ html_content = await page.content()
 
24
  except Exception as e:
25
+ await browser.close()
26
  return f"Error fetching page with Playwright: {e}"
27
+ await browser.close()
28
  return html_content
29
 
30
  def choose_best_table_from_html(html_content: str, task_description: str) -> str:
 
65
 
66
  try:
67
  completion = client.chat.completions.create(
68
+ model="gpt-5-nano",
69
  response_format={"type": "json_object"},
70
  messages=[
71
  {"role": "system", "content": system_prompt},