Thanh Vinh Vo committed
Commit ed781c5
1 Parent(s): dd59d44
Files changed (1)
  1. app.py +57 -24
app.py CHANGED
@@ -191,61 +191,94 @@ def get_wikipedia_page_url_by_year(wikipedia_page_name: str, year: int) -> str:
 
 
 @tool
-def get_wikipedia_section(
+def get_wikipedia_section_tables(
     section_name: str, soup_object: BeautifulSoup
-) -> Tag | NavigableString | None:
+) -> list[pd.DataFrame]:
     """
-    A tool that extracts a specific section from a Wikipedia page using BeautifulSoup.
+    A tool that extracts tables from a specific section of a Wikipedia page using BeautifulSoup and pandas.
 
     This function searches for a section in the following order:
     1. First tries to find an element with ID matching the section name
     2. If not found, tries to find an h2 element with text matching the section name
     3. If not found, tries to find an h3 element with text matching the section name
 
+    Once the section is found, it goes to the parent element, finds the next <table> sibling,
+    and uses pandas read_html to extract the table data.
+
     Args:
-        section_name (str): The name of the section to extract
+        section_name (str): The name of the section to extract tables from
         soup_object: A BeautifulSoup object containing the parsed HTML content
 
     Returns:
-        Element: The found HTML element, or None if not found
+        list: A list of pandas DataFrames representing tables found after the section,
+        or an empty list if no tables are found
 
     Example:
         >>> from bs4 import BeautifulSoup
-        >>> html = "<html><body><h2>History</h2><p>Some history content</p></body></html>"
+        >>> html = "<html><body><h2>Statistics</h2><table><tr><td>Data</td></tr></table></body></html>"
         >>> soup = BeautifulSoup(html, 'html.parser')
-        >>> section_element = get_wikipedia_section("History", soup)
-        >>> print(section_element)
+        >>> tables = get_wikipedia_section_tables("Statistics", soup)
+        >>> print(tables[0] if tables else "No tables found")
     """
+    import pandas as pd
     from bs4 import BeautifulSoup
 
     if not soup_object:
-        return None
+        return []
 
     # Ensure we have a BeautifulSoup object
     if not isinstance(soup_object, BeautifulSoup):
-        return None
+        return []
+
+    section_element = None
 
     # Strategy 1: Try to find element with ID same as section name
    # Convert section name to potential ID format (replace spaces with underscores, etc.)
     section_id = section_name.replace(" ", "_")
     element = soup_object.find(id=section_id)
     if element:
-        return element
+        section_element = element
 
     # Strategy 2: Try to find h2 element with text same as section name
-    h2_elements = soup_object.find_all("h2")
-    for h2 in h2_elements:
-        if h2.get_text().strip() == section_name:
-            return h2
+    if not section_element:
+        h2_elements = soup_object.find_all("h2")
+        for h2 in h2_elements:
+            if h2.get_text().strip() == section_name:
+                section_element = h2
+                break
 
     # Strategy 3: Try to find h3 element with text same as section name
-    h3_elements = soup_object.find_all("h3")
-    for h3 in h3_elements:
-        if h3.get_text().strip() == section_name:
-            return h3
-
-    # If no section found, return None
-    return None
+    if not section_element:
+        h3_elements = soup_object.find_all("h3")
+        for h3 in h3_elements:
+            if h3.get_text().strip() == section_name:
+                section_element = h3
+                break
+
+    # If no section found, return empty list
+    if not section_element:
+        return []
+
+    # Go to parent element and find next table sibling
+    parent = section_element.parent
+    if not parent:
+        return []
+
+    # Find the next table sibling from the parent
+    table = parent.find_next_sibling("table")
+    if not table:
+        return []
+    try:
+        # Use pandas read_html to extract table data
+        table_html = str(table)
+        tables = pd.read_html(table_html)
+        return tables if tables else []
+    except ValueError:
+        # No tables found or parsing error
+        return []
+    except Exception:
+        # Any other error
+        return []
 
 
 @tool
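
Note on the new pd.read_html(table_html) call above: pandas 2.1 deprecated passing literal HTML as a bare string, so this line emits a FutureWarning on recent pandas versions. A minimal sketch of the non-deprecated form, assuming pandas >= 2.1 and a table tag found as in the function above:

    from io import StringIO

    import pandas as pd

    # Wrapping the rendered tag in StringIO makes read_html treat it as a
    # file-like buffer rather than a URL or file path (pandas >= 2.1 form).
    tables = pd.read_html(StringIO(str(table)))
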
@@ -316,7 +349,7 @@ class BasicAgent:
             audio_to_text,
             WikipediaSearchTool(),
             get_wikipedia_page_url_by_year,
-            get_wikipedia_section,
+            get_wikipedia_section_tables,
         ],
         model=OpenAIServerModel(model_id="gpt-4o"),
         additional_authorized_imports=[
@@ -356,7 +389,7 @@ class BasicAgent:
             get_file,
             audio_to_text,
             get_wikipedia_page_url_by_year,
-            get_wikipedia_section,
+            get_wikipedia_section_tables,
         ],
         managed_agents=[self.multimodal_agent],
         additional_authorized_imports=[
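
Both tool lists above now register the table-extracting variant with a smolagents CodeAgent. As a rough sketch of that wiring, assuming the smolagents imports app.py already uses (the real tool lists are longer, and the second agent also takes managed_agents):

    from smolagents import CodeAgent, OpenAIServerModel

    agent = CodeAgent(
        tools=[get_wikipedia_section_tables],
        model=OpenAIServerModel(model_id="gpt-4o"),
        # lets generated code import the libraries the tool's output relies on
        additional_authorized_imports=["pandas", "bs4"],
    )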
 
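For a quick end-to-end check of the new tool, a hedged sketch assuming network access and the requests package (on live Wikipedia markup the heading's parent often has no <table> as a direct next sibling, in which case the tool returns an empty list by design):

    import requests
    from bs4 import BeautifulSoup

    html = requests.get("https://en.wikipedia.org/wiki/Python_(programming_language)").text
    soup = BeautifulSoup(html, "html.parser")

    # Falls back through id, h2, then h3 lookup before hunting for the table
    tables = get_wikipedia_section_tables("History", soup)
    print(tables[0].head() if tables else "No tables found")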