KarthikMuraliM committed on
Commit
9f768b8
·
1 Parent(s): 8de87af

Fix: Refactor scraping to use table index instead of custom selector

Browse files
Files changed (1) hide show
  1. app.py +13 -18
app.py CHANGED
@@ -29,15 +29,14 @@ async def analyze_data(
29
  return {"error": "Scraping task detected, but no URL was found."}
30
 
31
  # --- AGENT WORKFLOW ---
32
- # 1. PERCEIVE: Await the async function
33
  print(f"Step 1: Fetching dynamic HTML from {url}")
34
- # Add the 'await' keyword here!
35
  html_content = await tools.get_dynamic_html(url)
36
  if "Error" in html_content:
37
  return {"error": html_content}
38
 
39
- # 2. DECIDE: This tool is synchronous, so no await is needed
40
- print("Step 2: Asking LLM to choose the best table.")
41
  task_description = f"Find a table with the following information: {questions_text}"
42
  choice_json_str = tools.choose_best_table_from_html(html_content, task_description)
43
 
@@ -45,35 +44,31 @@ async def analyze_data(
45
  choice = json.loads(choice_json_str)
46
  if "error" in choice:
47
  return {"error": choice["error"]}
48
- selector = choice.get("selector")
49
- if not selector:
50
- return {"error": "LLM failed to return a valid selector."}
51
  except json.JSONDecodeError:
52
  return {"error": f"Failed to decode LLM response for table choice: {choice_json_str}"}
53
 
54
- # 3. ACT: This tool is synchronous
55
- print(f"Step 3: Extracting table with selector '{selector}'.")
56
- df_or_error = tools.extract_table_to_dataframe(html_content, selector)
57
  if isinstance(df_or_error, str):
58
  return {"error": df_or_error}
59
 
60
- # 4. ANALYSIS: The OpenAI call is synchronous in the SDK v1.0+
61
  print("Step 4: Analyzing data with LLM.")
62
  data_string = df_or_error.to_csv(index=False)
63
  if len(data_string) > 15000:
64
  data_string = df_or_error.head(50).to_csv(index=False)
65
 
66
  system_prompt = """
67
- You are an expert data analyst agent. You will be given a dataset in CSV format and a list of questions about it.
68
- Your task is to answer the questions based *only* on the provided data.
69
- Perform necessary data cleaning and type conversions mentally before answering (e.g., handle '$' signs, commas in numbers, dates).
70
- Provide your final answers as a single JSON object, with a key named "answers" which contains a list of strings. Each string in the list should be the answer to one of the user's questions, in order.
71
- For example: {"answers": ["Answer to Q1", "Answer to Q2"]}
72
  """
73
  user_prompt = f"Data:\n{data_string}\n\nQuestions:\n{questions_text}"
74
 
75
  try:
76
- completion = client.chat.completions.create(model="gpt-5-nano", response_format={"type": "json_object"}, messages=[{"role": "system", "content": system_prompt}, {"role": "user", "content": user_prompt}])
77
  response_data = json.loads(completion.choices[0].message.content)
78
  return response_data.get("answers", {"error": "LLM did not return answers in the expected format."})
79
  except Exception as e:
@@ -81,4 +76,4 @@ async def analyze_data(
81
 
82
  else:
83
  # Handle non-scraping tasks here
84
- return {"response": "This is a non-scraping task."}
 
29
  return {"error": "Scraping task detected, but no URL was found."}
30
 
31
  # --- AGENT WORKFLOW ---
32
+ # 1. PERCEIVE
33
  print(f"Step 1: Fetching dynamic HTML from {url}")
 
34
  html_content = await tools.get_dynamic_html(url)
35
  if "Error" in html_content:
36
  return {"error": html_content}
37
 
38
+ # 2. DECIDE
39
+ print("Step 2: Asking LLM to choose the best table index.")
40
  task_description = f"Find a table with the following information: {questions_text}"
41
  choice_json_str = tools.choose_best_table_from_html(html_content, task_description)
42
 
 
44
  choice = json.loads(choice_json_str)
45
  if "error" in choice:
46
  return {"error": choice["error"]}
47
+ table_index = choice.get("index") # Get the index from the LLM response
48
+ if table_index is None or not isinstance(table_index, int):
49
+ return {"error": "LLM failed to return a valid integer index for the table."}
50
  except json.JSONDecodeError:
51
  return {"error": f"Failed to decode LLM response for table choice: {choice_json_str}"}
52
 
53
+ # 3. ACT
54
+ print(f"Step 3: Extracting table with index '{table_index}'.")
55
+ df_or_error = tools.extract_table_to_dataframe(html_content, table_index) # Use the index
56
  if isinstance(df_or_error, str):
57
  return {"error": df_or_error}
58
 
59
+ # 4. ANALYSIS (This part is unchanged)
60
  print("Step 4: Analyzing data with LLM.")
61
  data_string = df_or_error.to_csv(index=False)
62
  if len(data_string) > 15000:
63
  data_string = df_or_error.head(50).to_csv(index=False)
64
 
65
  system_prompt = """
66
+ You are an expert data analyst agent... Respond with a JSON object: {\"answers\": [...]}
 
 
 
 
67
  """
68
  user_prompt = f"Data:\n{data_string}\n\nQuestions:\n{questions_text}"
69
 
70
  try:
71
+ completion = client.chat.completions.create(model="gpt-4o", response_format={"type": "json_object"}, messages=[{"role": "system", "content": system_prompt}, {"role": "user", "content": user_prompt}])
72
  response_data = json.loads(completion.choices[0].message.content)
73
  return response_data.get("answers", {"error": "LLM did not return answers in the expected format."})
74
  except Exception as e:
 
76
 
77
  else:
78
  # Handle non-scraping tasks here
79
+ return {"response": "This is a non-scraping task."}