Final_Project_Agent_Course

Sleeping

App Files Files Community

Thanh Vinh Vo committed on Jul 10, 2025

Commit

ed781c5

1 Parent(s): dd59d44

update

Browse files

Files changed (1) hide show

app.py +57 -24

app.py CHANGED Viewed

@@ -191,61 +191,94 @@ def get_wikipedia_page_url_by_year(wikipedia_page_name: str, year: int) -> str:
 @tool
-def get_wikipedia_section(
     section_name: str, soup_object: BeautifulSoup
-) -> Tag | NavigableString | None:
     """
-    A tool that extracts a specific section from a Wikipedia page using BeautifulSoup.
     This function searches for a section in the following order:
     1. First tries to find an element with ID matching the section name
     2. If not found, tries to find an h2 element with text matching the section name
     3. If not found, tries to find an h3 element with text matching the section name
     Args:
-        section_name (str): The name of the section to extract
         soup_object: A BeautifulSoup object containing the parsed HTML content
     Returns:
-        Element: The found HTML element, or None if not found
     Example:
         >>> from bs4 import BeautifulSoup
-        >>> html = "<html><body><h2>History</h2><p>Some history content</p></body></html>"
         >>> soup = BeautifulSoup(html, 'html.parser')
-        >>> section_element = get_wikipedia_section("History", soup)
-        >>> print(section_element)
     """
     from bs4 import BeautifulSoup
     if not soup_object:
-        return None
     # Ensure we have a BeautifulSoup object
     if not isinstance(soup_object, BeautifulSoup):
-        return None
     # Strategy 1: Try to find element with ID same as section name
     # Convert section name to potential ID format (replace spaces with underscores, etc.)
     section_id = section_name.replace(" ", "_")
     element = soup_object.find(id=section_id)
     if element:
-        return element
     # Strategy 2: Try to find h2 element with text same as section name
-    h2_elements = soup_object.find_all("h2")
-    for h2 in h2_elements:
-        if h2.get_text().strip() == section_name:
-            return h2
     # Strategy 3: Try to find h3 element with text same as section name
-    h3_elements = soup_object.find_all("h3")
-    for h3 in h3_elements:
-        if h3.get_text().strip() == section_name:
-            return h3
-    # If no section found, return None
-    return None
 @tool
@@ -316,7 +349,7 @@ class BasicAgent:
                 audio_to_text,
                 WikipediaSearchTool(),
                 get_wikipedia_page_url_by_year,
-                get_wikipedia_section,
             ],
             model=OpenAIServerModel(model_id="gpt-4o"),
             additional_authorized_imports=[
@@ -356,7 +389,7 @@ class BasicAgent:
                 get_file,
                 audio_to_text,
                 get_wikipedia_page_url_by_year,
-                get_wikipedia_section,
             ],
             managed_agents=[self.multimodal_agent],
             additional_authorized_imports=[

 @tool
 def get_wikipedia_section_tables(
     section_name: str, soup_object: BeautifulSoup
 ) -> list[pd.DataFrame]:
     """
     A tool that extracts the table following a specific section of a Wikipedia page using BeautifulSoup and pandas.
     The section is located by trying, in order:
     1. An element whose ID matches the section name (spaces replaced with underscores)
     2. An h2 element whose stripped text equals the section name
     3. An h3 element whose stripped text equals the section name
     Once the section is found, the first <table> among the following siblings of the
     section's parent element is used. If there is none, the first <table> among the
     following siblings of the section element itself is used instead. That table's HTML
     is then parsed with pandas read_html.
     Args:
         section_name: The name of the section to extract the table from
         soup_object: A BeautifulSoup object containing the parsed HTML content
     Returns:
         list: The pandas DataFrames parsed from the first table found after the section,
               or an empty list if the input is not a BeautifulSoup object, the section
               or table cannot be found, or the table cannot be parsed
     Example:
         >>> from bs4 import BeautifulSoup
         >>> html = "<html><body><h2>Statistics</h2><table><tr><td>Data</td></tr></table></body></html>"
         >>> soup = BeautifulSoup(html, 'html.parser')
         >>> tables = get_wikipedia_section_tables("Statistics", soup)
         >>> print(tables[0] if tables else "No tables found")
     """
     # Imports stay inside the function, as in the original tool body.
     from io import StringIO
     import pandas as pd
     from bs4 import BeautifulSoup
     # Rejects None as well as any object that is not a parsed document.
     if not isinstance(soup_object, BeautifulSoup):
         return []
     # Strategy 1: element whose ID is the section name with spaces as underscores.
     section_element = soup_object.find(id=section_name.replace(" ", "_"))
     # Strategies 2 and 3: exact heading text; every h2 is checked before any h3.
     if section_element is None:
         for heading_tag in ("h2", "h3"):
             section_element = next(
                 (
                     heading
                     for heading in soup_object.find_all(heading_tag)
                     if heading.get_text().strip() == section_name
                 ),
                 None,
             )
             if section_element is not None:
                 break
     if section_element is None:
         return []
     # Preferred lookup: the section element is wrapped (e.g. an ID-bearing element
     # inside a heading container), so the table is a later sibling of the wrapper.
     table = None
     parent = section_element.parent
     if parent is not None:
         table = parent.find_next_sibling("table")
     # Fallback: the section element sits directly next to its table (a bare heading
     # followed by <table>), where the parent has no table sibling at all.
     if table is None:
         table = section_element.find_next_sibling("table")
     if table is None:
         return []
     try:
         # read_html expects a file-like object; passing literal HTML text is deprecated.
         return pd.read_html(StringIO(str(table)))
     except Exception:
         # Deliberate best-effort: an unparsable table or a missing parser backend
         # yields "no tables" instead of an exception.
         return []
 @tool
                 audio_to_text,
                 WikipediaSearchTool(),
                 get_wikipedia_page_url_by_year,
+                get_wikipedia_section_tables,
             ],
             model=OpenAIServerModel(model_id="gpt-4o"),
             additional_authorized_imports=[
                 get_file,
                 audio_to_text,
                 get_wikipedia_page_url_by_year,
+                get_wikipedia_section_tables,
             ],
             managed_agents=[self.multimodal_agent],
             additional_authorized_imports=[