Spaces:
Sleeping
Sleeping
Commit ·
a523805
1
Parent(s): cfd667b
Implement dynamic, agentic scraping with Playwright and debugged
Browse files
app.py
CHANGED
|
@@ -29,13 +29,14 @@ async def analyze_data(
|
|
| 29 |
return {"error": "Scraping task detected, but no URL was found."}
|
| 30 |
|
| 31 |
# --- AGENT WORKFLOW ---
|
| 32 |
-
# 1. PERCEIVE:
|
| 33 |
print(f"Step 1: Fetching dynamic HTML from {url}")
|
| 34 |
-
|
|
|
|
| 35 |
if "Error" in html_content:
|
| 36 |
return {"error": html_content}
|
| 37 |
|
| 38 |
-
# 2. DECIDE:
|
| 39 |
print("Step 2: Asking LLM to choose the best table.")
|
| 40 |
task_description = f"Find a table with the following information: {questions_text}"
|
| 41 |
choice_json_str = tools.choose_best_table_from_html(html_content, task_description)
|
|
@@ -50,23 +51,29 @@ async def analyze_data(
|
|
| 50 |
except json.JSONDecodeError:
|
| 51 |
return {"error": f"Failed to decode LLM response for table choice: {choice_json_str}"}
|
| 52 |
|
| 53 |
-
# 3. ACT:
|
| 54 |
print(f"Step 3: Extracting table with selector '{selector}'.")
|
| 55 |
df_or_error = tools.extract_table_to_dataframe(html_content, selector)
|
| 56 |
if isinstance(df_or_error, str):
|
| 57 |
return {"error": df_or_error}
|
| 58 |
|
| 59 |
-
#
|
| 60 |
print("Step 4: Analyzing data with LLM.")
|
| 61 |
data_string = df_or_error.to_csv(index=False)
|
| 62 |
if len(data_string) > 15000:
|
| 63 |
data_string = df_or_error.head(50).to_csv(index=False)
|
| 64 |
|
| 65 |
-
system_prompt = "
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 66 |
user_prompt = f"Data:\n{data_string}\n\nQuestions:\n{questions_text}"
|
| 67 |
|
| 68 |
try:
|
| 69 |
-
completion = client.chat.completions.create(model="gpt-
|
| 70 |
response_data = json.loads(completion.choices[0].message.content)
|
| 71 |
return response_data.get("answers", {"error": "LLM did not return answers in the expected format."})
|
| 72 |
except Exception as e:
|
|
|
|
| 29 |
return {"error": "Scraping task detected, but no URL was found."}
|
| 30 |
|
| 31 |
# --- AGENT WORKFLOW ---
|
| 32 |
+
# 1. PERCEIVE: Await the async function
|
| 33 |
print(f"Step 1: Fetching dynamic HTML from {url}")
|
| 34 |
+
# Add the 'await' keyword here!
|
| 35 |
+
html_content = await tools.get_dynamic_html(url)
|
| 36 |
if "Error" in html_content:
|
| 37 |
return {"error": html_content}
|
| 38 |
|
| 39 |
+
# 2. DECIDE: This tool is synchronous, so no await is needed
|
| 40 |
print("Step 2: Asking LLM to choose the best table.")
|
| 41 |
task_description = f"Find a table with the following information: {questions_text}"
|
| 42 |
choice_json_str = tools.choose_best_table_from_html(html_content, task_description)
|
|
|
|
| 51 |
except json.JSONDecodeError:
|
| 52 |
return {"error": f"Failed to decode LLM response for table choice: {choice_json_str}"}
|
| 53 |
|
| 54 |
+
# 3. ACT: This tool is synchronous
|
| 55 |
print(f"Step 3: Extracting table with selector '{selector}'.")
|
| 56 |
df_or_error = tools.extract_table_to_dataframe(html_content, selector)
|
| 57 |
if isinstance(df_or_error, str):
|
| 58 |
return {"error": df_or_error}
|
| 59 |
|
| 60 |
+
# 4. ANALYSIS: The OpenAI call is synchronous in the SDK v1.0+
|
| 61 |
print("Step 4: Analyzing data with LLM.")
|
| 62 |
data_string = df_or_error.to_csv(index=False)
|
| 63 |
if len(data_string) > 15000:
|
| 64 |
data_string = df_or_error.head(50).to_csv(index=False)
|
| 65 |
|
| 66 |
+
system_prompt = """
|
| 67 |
+
You are an expert data analyst agent. You will be given a dataset in CSV format and a list of questions about it.
|
| 68 |
+
Your task is to answer the questions based *only* on the provided data.
|
| 69 |
+
Perform necessary data cleaning and type conversions mentally before answering (e.g., handle '$' signs, commas in numbers, dates).
|
| 70 |
+
Provide your final answers as a single JSON object, with a key named "answers" which contains a list of strings. Each string in the list should be the answer to one of the user's questions, in order.
|
| 71 |
+
For example: {"answers": ["Answer to Q1", "Answer to Q2"]}
|
| 72 |
+
"""
|
| 73 |
user_prompt = f"Data:\n{data_string}\n\nQuestions:\n{questions_text}"
|
| 74 |
|
| 75 |
try:
|
| 76 |
+
completion = client.chat.completions.create(model="gpt-5-nano", response_format={"type": "json_object"}, messages=[{"role": "system", "content": system_prompt}, {"role": "user", "content": user_prompt}])
|
| 77 |
response_data = json.loads(completion.choices[0].message.content)
|
| 78 |
return response_data.get("answers", {"error": "LLM did not return answers in the expected format."})
|
| 79 |
except Exception as e:
|
tools.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
# tools.py
|
| 2 |
import pandas as pd
|
| 3 |
-
from playwright.
|
| 4 |
from bs4 import BeautifulSoup
|
| 5 |
import json
|
| 6 |
import openai
|
|
@@ -12,19 +12,19 @@ def set_openai_client(c):
|
|
| 12 |
global client
|
| 13 |
client = c
|
| 14 |
|
| 15 |
-
def get_dynamic_html(url: str) -> str:
|
| 16 |
-
"""Fetches the fully rendered HTML of a page using Playwright."""
|
| 17 |
-
with
|
| 18 |
-
|
| 19 |
-
|
|
|
|
| 20 |
try:
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
html_content = page.content()
|
| 24 |
except Exception as e:
|
| 25 |
-
browser.close()
|
| 26 |
return f"Error fetching page with Playwright: {e}"
|
| 27 |
-
browser.close()
|
| 28 |
return html_content
|
| 29 |
|
| 30 |
def choose_best_table_from_html(html_content: str, task_description: str) -> str:
|
|
@@ -65,7 +65,7 @@ def choose_best_table_from_html(html_content: str, task_description: str) -> str
|
|
| 65 |
|
| 66 |
try:
|
| 67 |
completion = client.chat.completions.create(
|
| 68 |
-
model="gpt-
|
| 69 |
response_format={"type": "json_object"},
|
| 70 |
messages=[
|
| 71 |
{"role": "system", "content": system_prompt},
|
|
|
|
| 1 |
# tools.py
|
| 2 |
import pandas as pd
|
| 3 |
+
from playwright.async_api import async_playwright
|
| 4 |
from bs4 import BeautifulSoup
|
| 5 |
import json
|
| 6 |
import openai
|
|
|
|
| 12 |
global client
|
| 13 |
client = c
|
| 14 |
|
| 15 |
+
async def get_dynamic_html(url: str) -> str:
|
| 16 |
+
"""Fetches the fully rendered HTML of a page using Playwright's ASYNC API."""
|
| 17 |
+
# 'async with' is the asynchronous context manager
|
| 18 |
+
async with async_playwright() as p:
|
| 19 |
+
browser = await p.chromium.launch()
|
| 20 |
+
page = await browser.new_page()
|
| 21 |
try:
|
| 22 |
+
await page.goto(url, timeout=20000, wait_until='networkidle')
|
| 23 |
+
html_content = await page.content()
|
|
|
|
| 24 |
except Exception as e:
|
| 25 |
+
await browser.close()
|
| 26 |
return f"Error fetching page with Playwright: {e}"
|
| 27 |
+
await browser.close()
|
| 28 |
return html_content
|
| 29 |
|
| 30 |
def choose_best_table_from_html(html_content: str, task_description: str) -> str:
|
|
|
|
| 65 |
|
| 66 |
try:
|
| 67 |
completion = client.chat.completions.create(
|
| 68 |
+
model="gpt-5-nano",
|
| 69 |
response_format={"type": "json_object"},
|
| 70 |
messages=[
|
| 71 |
{"role": "system", "content": system_prompt},
|