Spaces:
Sleeping
Sleeping
Commit ·
9f768b8
1
Parent(s): 8de87af
Fix: Refactor scraping to use table index instead of custom selector
Browse files
app.py
CHANGED
|
@@ -29,15 +29,14 @@ async def analyze_data(
|
|
| 29 |
return {"error": "Scraping task detected, but no URL was found."}
|
| 30 |
|
| 31 |
# --- AGENT WORKFLOW ---
|
| 32 |
-
# 1. PERCEIVE
|
| 33 |
print(f"Step 1: Fetching dynamic HTML from {url}")
|
| 34 |
-
# Add the 'await' keyword here!
|
| 35 |
html_content = await tools.get_dynamic_html(url)
|
| 36 |
if "Error" in html_content:
|
| 37 |
return {"error": html_content}
|
| 38 |
|
| 39 |
-
# 2. DECIDE
|
| 40 |
-
print("Step 2: Asking LLM to choose the best table.")
|
| 41 |
task_description = f"Find a table with the following information: {questions_text}"
|
| 42 |
choice_json_str = tools.choose_best_table_from_html(html_content, task_description)
|
| 43 |
|
|
@@ -45,35 +44,31 @@ async def analyze_data(
|
|
| 45 |
choice = json.loads(choice_json_str)
|
| 46 |
if "error" in choice:
|
| 47 |
return {"error": choice["error"]}
|
| 48 |
-
|
| 49 |
-
if not
|
| 50 |
-
return {"error": "LLM failed to return a valid
|
| 51 |
except json.JSONDecodeError:
|
| 52 |
return {"error": f"Failed to decode LLM response for table choice: {choice_json_str}"}
|
| 53 |
|
| 54 |
-
# 3. ACT
|
| 55 |
-
print(f"Step 3: Extracting table with
|
| 56 |
-
df_or_error = tools.extract_table_to_dataframe(html_content,
|
| 57 |
if isinstance(df_or_error, str):
|
| 58 |
return {"error": df_or_error}
|
| 59 |
|
| 60 |
-
# 4. ANALYSIS
|
| 61 |
print("Step 4: Analyzing data with LLM.")
|
| 62 |
data_string = df_or_error.to_csv(index=False)
|
| 63 |
if len(data_string) > 15000:
|
| 64 |
data_string = df_or_error.head(50).to_csv(index=False)
|
| 65 |
|
| 66 |
system_prompt = """
|
| 67 |
-
You are an expert data analyst agent.
|
| 68 |
-
Your task is to answer the questions based *only* on the provided data.
|
| 69 |
-
Perform necessary data cleaning and type conversions mentally before answering (e.g., handle '$' signs, commas in numbers, dates).
|
| 70 |
-
Provide your final answers as a single JSON object, with a key named "answers" which contains a list of strings. Each string in the list should be the answer to one of the user's questions, in order.
|
| 71 |
-
For example: {"answers": ["Answer to Q1", "Answer to Q2"]}
|
| 72 |
"""
|
| 73 |
user_prompt = f"Data:\n{data_string}\n\nQuestions:\n{questions_text}"
|
| 74 |
|
| 75 |
try:
|
| 76 |
-
completion = client.chat.completions.create(model="gpt-
|
| 77 |
response_data = json.loads(completion.choices[0].message.content)
|
| 78 |
return response_data.get("answers", {"error": "LLM did not return answers in the expected format."})
|
| 79 |
except Exception as e:
|
|
@@ -81,4 +76,4 @@ async def analyze_data(
|
|
| 81 |
|
| 82 |
else:
|
| 83 |
# Handle non-scraping tasks here
|
| 84 |
-
return {"response": "This is a non-scraping task."}
|
|
|
|
| 29 |
return {"error": "Scraping task detected, but no URL was found."}
|
| 30 |
|
| 31 |
# --- AGENT WORKFLOW ---
|
| 32 |
+
# 1. PERCEIVE
|
| 33 |
print(f"Step 1: Fetching dynamic HTML from {url}")
|
|
|
|
| 34 |
html_content = await tools.get_dynamic_html(url)
|
| 35 |
if "Error" in html_content:
|
| 36 |
return {"error": html_content}
|
| 37 |
|
| 38 |
+
# 2. DECIDE
|
| 39 |
+
print("Step 2: Asking LLM to choose the best table index.")
|
| 40 |
task_description = f"Find a table with the following information: {questions_text}"
|
| 41 |
choice_json_str = tools.choose_best_table_from_html(html_content, task_description)
|
| 42 |
|
|
|
|
| 44 |
choice = json.loads(choice_json_str)
|
| 45 |
if "error" in choice:
|
| 46 |
return {"error": choice["error"]}
|
| 47 |
+
table_index = choice.get("index") # Get the index from the LLM response
|
| 48 |
+
if table_index is None or not isinstance(table_index, int):
|
| 49 |
+
return {"error": "LLM failed to return a valid integer index for the table."}
|
| 50 |
except json.JSONDecodeError:
|
| 51 |
return {"error": f"Failed to decode LLM response for table choice: {choice_json_str}"}
|
| 52 |
|
| 53 |
+
# 3. ACT
|
| 54 |
+
print(f"Step 3: Extracting table with index '{table_index}'.")
|
| 55 |
+
df_or_error = tools.extract_table_to_dataframe(html_content, table_index) # Use the index
|
| 56 |
if isinstance(df_or_error, str):
|
| 57 |
return {"error": df_or_error}
|
| 58 |
|
| 59 |
+
# 4. ANALYSIS (This part is unchanged)
|
| 60 |
print("Step 4: Analyzing data with LLM.")
|
| 61 |
data_string = df_or_error.to_csv(index=False)
|
| 62 |
if len(data_string) > 15000:
|
| 63 |
data_string = df_or_error.head(50).to_csv(index=False)
|
| 64 |
|
| 65 |
system_prompt = """
|
| 66 |
+
You are an expert data analyst agent... Respond with a JSON object: {\"answers\": [...]}
|
|
|
|
|
|
|
|
|
|
|
|
|
| 67 |
"""
|
| 68 |
user_prompt = f"Data:\n{data_string}\n\nQuestions:\n{questions_text}"
|
| 69 |
|
| 70 |
try:
|
| 71 |
+
completion = client.chat.completions.create(model="gpt-4o", response_format={"type": "json_object"}, messages=[{"role": "system", "content": system_prompt}, {"role": "user", "content": user_prompt}])
|
| 72 |
response_data = json.loads(completion.choices[0].message.content)
|
| 73 |
return response_data.get("answers", {"error": "LLM did not return answers in the expected format."})
|
| 74 |
except Exception as e:
|
|
|
|
| 76 |
|
| 77 |
else:
|
| 78 |
# Handle non-scraping tasks here
|
| 79 |
+
return {"response": "This is a non-scraping task."}
|