Spaces:

Karthix1
/

data-analyst-agent

Sleeping

App Files Files Community

KarthikMuraliM committed on Sep 4, 2025

Commit

a523805

1 Parent(s): cfd667b

Implement and debug dynamic, agentic scraping with Playwright

Browse files

Files changed (2) hide show

app.py +14 -7
tools.py +12 -12

app.py CHANGED Viewed

@@ -29,13 +29,14 @@ async def analyze_data(
             return {"error": "Scraping task detected, but no URL was found."}
         # --- AGENT WORKFLOW ---
-        # 1. PERCEIVE: Get the full page content
         print(f"Step 1: Fetching dynamic HTML from {url}")
-        html_content = tools.get_dynamic_html(url)
         if "Error" in html_content:
             return {"error": html_content}
-        # 2. DECIDE: Ask LLM to choose the best table for the task
         print("Step 2: Asking LLM to choose the best table.")
         task_description = f"Find a table with the following information: {questions_text}"
         choice_json_str = tools.choose_best_table_from_html(html_content, task_description)
@@ -50,23 +51,29 @@ async def analyze_data(
         except json.JSONDecodeError:
             return {"error": f"Failed to decode LLM response for table choice: {choice_json_str}"}
-        # 3. ACT: Extract the chosen table into a DataFrame
         print(f"Step 3: Extracting table with selector '{selector}'.")
         df_or_error = tools.extract_table_to_dataframe(html_content, selector)
         if isinstance(df_or_error, str):
             return {"error": df_or_error}
-        # --- ANALYSIS (same as before) ---
         print("Step 4: Analyzing data with LLM.")
         data_string = df_or_error.to_csv(index=False)
         if len(data_string) > 15000:
             data_string = df_or_error.head(50).to_csv(index=False)
-        system_prompt = "You are an expert data analyst... respond with a JSON object: {\"answers\": [...]}" # (Same prompt as before)
         user_prompt = f"Data:\n{data_string}\n\nQuestions:\n{questions_text}"
         try:
-            completion = client.chat.completions.create(model="gpt-4o", response_format={"type": "json_object"}, messages=[{"role": "system", "content": system_prompt}, {"role": "user", "content": user_prompt}])
             response_data = json.loads(completion.choices[0].message.content)
             return response_data.get("answers", {"error": "LLM did not return answers in the expected format."})
         except Exception as e:

             return {"error": "Scraping task detected, but no URL was found."}
         # --- AGENT WORKFLOW ---
+        # 1. PERCEIVE: Await the async function
         print(f"Step 1: Fetching dynamic HTML from {url}")
+        # Add the 'await' keyword here!
+        html_content = await tools.get_dynamic_html(url)
         if "Error" in html_content:
             return {"error": html_content}
+        # 2. DECIDE: This tool is synchronous, so no await is needed
         print("Step 2: Asking LLM to choose the best table.")
         task_description = f"Find a table with the following information: {questions_text}"
         choice_json_str = tools.choose_best_table_from_html(html_content, task_description)
         except json.JSONDecodeError:
             return {"error": f"Failed to decode LLM response for table choice: {choice_json_str}"}
+        # 3. ACT: This tool is synchronous
         print(f"Step 3: Extracting table with selector '{selector}'.")
         df_or_error = tools.extract_table_to_dataframe(html_content, selector)
         if isinstance(df_or_error, str):
             return {"error": df_or_error}
+        # 4. ANALYSIS: The OpenAI call is synchronous in the SDK v1.0+
         print("Step 4: Analyzing data with LLM.")
         data_string = df_or_error.to_csv(index=False)
         if len(data_string) > 15000:
             data_string = df_or_error.head(50).to_csv(index=False)
+        system_prompt = """
+        You are an expert data analyst agent. You will be given a dataset in CSV format and a list of questions about it.
+        Your task is to answer the questions based *only* on the provided data.
+        Perform necessary data cleaning and type conversions mentally before answering (e.g., handle '$' signs, commas in numbers, dates).
+        Provide your final answers as a single JSON object, with a key named "answers" which contains a list of strings. Each string in the list should be the answer to one of the user's questions, in order.
+        For example: {"answers": ["Answer to Q1", "Answer to Q2"]}
+        """
         user_prompt = f"Data:\n{data_string}\n\nQuestions:\n{questions_text}"
         try:
+            completion = client.chat.completions.create(model="gpt-5-nano", response_format={"type": "json_object"}, messages=[{"role": "system", "content": system_prompt}, {"role": "user", "content": user_prompt}])
             response_data = json.loads(completion.choices[0].message.content)
             return response_data.get("answers", {"error": "LLM did not return answers in the expected format."})
         except Exception as e:

tools.py CHANGED Viewed

@@ -1,6 +1,6 @@
 # tools.py
 import pandas as pd
-from playwright.sync_api import sync_playwright
 from bs4 import BeautifulSoup
 import json
 import openai
@@ -12,19 +12,19 @@ def set_openai_client(c):
     global client
     client = c
-def get_dynamic_html(url: str) -> str:
-    """Fetches the fully rendered HTML of a page using Playwright."""
-    with sync_playwright() as p:
-        browser = p.chromium.launch()
-        page = browser.new_page()
         try:
-            # Use networkidle to wait for most dynamic content to load
-            page.goto(url, timeout=20000, wait_until='networkidle')
-            html_content = page.content()
         except Exception as e:
-            browser.close()
             return f"Error fetching page with Playwright: {e}"
-        browser.close()
         return html_content
 def choose_best_table_from_html(html_content: str, task_description: str) -> str:
@@ -65,7 +65,7 @@ def choose_best_table_from_html(html_content: str, task_description: str) -> str
     try:
         completion = client.chat.completions.create(
-            model="gpt-4o",
             response_format={"type": "json_object"},
             messages=[
                 {"role": "system", "content": system_prompt},

 # tools.py
 import pandas as pd
+from playwright.async_api import async_playwright
 from bs4 import BeautifulSoup
 import json
 import openai
     global client
     client = c
+async def get_dynamic_html(url: str) -> str:
+    """Fetches the fully rendered HTML of a page using Playwright's ASYNC API."""
+    # 'async with' is the asynchronous context manager
+    async with async_playwright() as p:
+        browser = await p.chromium.launch()
+        page = await browser.new_page()
         try:
+            await page.goto(url, timeout=20000, wait_until='networkidle')
+            html_content = await page.content()
         except Exception as e:
+            await browser.close()
             return f"Error fetching page with Playwright: {e}"
+        await browser.close()
         return html_content
 def choose_best_table_from_html(html_content: str, task_description: str) -> str:
     try:
         completion = client.chat.completions.create(
+            model="gpt-5-nano",
             response_format={"type": "json_object"},
             messages=[
                 {"role": "system", "content": system_prompt},