import os
from io import StringIO

import pandas as pd
import requests
import wikipediaapi
from bs4 import BeautifulSoup
from smolagents.tools import tool


def fetch_wikipedia_page(url: str) -> str:
    """Fetch raw HTML of a Wikipedia page."""
    headers = {
        "User-Agent": "GAIA_benchmark_agent/1.0 (contact: gabriel.melki@gmail.com)",
        "Accept-Language": "en-US,en;q=0.9",
    }
    resp = requests.get(url, headers=headers, timeout=50)
    resp.raise_for_status()
    return resp.text


def _normalize_title(value: str) -> str:
    """Lowercase and collapse whitespace for robust title comparisons."""
    return " ".join(value.lower().split()) if isinstance(value, str) else ""


def _remove_sections_by_titles(soup: BeautifulSoup, titles: list[str]) -> None:
    """Remove sections (header + content until the next header of the same or
    higher level) whose header text matches any of `titles` (case-insensitive).

    Mutates `soup` in place.
    """
    if not titles:
        return
    excluded = {_normalize_title(t) for t in titles}
    header_tags = ["h1", "h2", "h3", "h4", "h5", "h6"]

    # Find all headers that match excluded titles.
    headers_to_remove = []
    for header in soup.find_all(header_tags):
        title_text = _normalize_title(header.get_text(" ", strip=True))
        if title_text in excluded:
            headers_to_remove.append(header)

    # Remove each matching section (header + content).
    for header in headers_to_remove:
        # Skip if the header was already removed as part of another section.
        if not header.parent:
            continue
        level = int(header.name[1])

        # Determine the container to remove: the header itself or its wrapper.
        # If the header sits inside a heading container (like div.mw-heading),
        # use that container as the starting point.
        header_container = header
        if (
            header.parent
            and header.parent.name == "div"
            and header.parent.get("class")
            and any("heading" in cls.lower() for cls in header.parent.get("class", []))
        ):
            header_container = header.parent

        nodes_to_remove = [header_container]

        # Collect all content after the header container until the next header
        # of the same or higher level.
        current = header_container
        while current.next_sibling:
            current = current.next_sibling
            sib_name = getattr(current, "name", None)

            # If we hit another header (directly or within a heading
            # container), check its level.
            next_header = None
            if sib_name in header_tags:
                next_header = current
            elif (
                sib_name == "div"
                and current.get("class")
                and any("heading" in cls.lower() for cls in current.get("class", []))
            ):
                # This is a heading container; find the header inside it.
                for child in current.find_all(header_tags):
                    next_header = child
                    break

            if next_header:
                next_level = int(next_header.name[1])
                if next_level <= level:
                    # A header of the same or higher level: stop here.
                    break

            # Add this node to the removal list.
            nodes_to_remove.append(current)

        # Remove all collected nodes.
        for node in nodes_to_remove:
            try:
                node.decompose()
            except Exception:
                try:
                    node.extract()
                except Exception:
                    pass


def _cleanup_non_content(root: BeautifulSoup) -> None:
    """Remove Wikipedia UI/maintenance blocks from the main content area."""
    selectors = [
        "div#toc",
        "div.toc",
        "div.hatnote",
        "div.shortdescription",
        "div.reflist",
        "ol.references",
        "div.navbox",
        "table.navbox",
        "table.vertical-navbox",
        "table.sidebar",
        "table.ambox",
        "table.metadata",
        "div#catlinks",
        "div.mw-authority-control",
        "div.printfooter",
        "div.portal",
        "table.infobox",  # avoid dumping the infobox into text
    ]
    for sel in selectors:
        for el in root.select(sel):
            try:
                el.decompose()
            except Exception:
                try:
                    el.extract()
                except Exception:
                    pass
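
# A minimal, self-contained sketch of what _remove_sections_by_titles does,
# using synthetic HTML rather than a real Wikipedia page. Illustrative only;
# it is defined as a function so nothing runs on import.
def _demo_remove_sections() -> None:
    snippet = (
        "<h2>History</h2><p>Kept.</p>"
        "<h2>References</h2><ul><li>Dropped.</li></ul>"
        "<h2>Legacy</h2><p>Kept too.</p>"
    )
    demo_soup = BeautifulSoup(snippet, "html.parser")
    _remove_sections_by_titles(demo_soup, ["References"])
    # The "References" header and its list are gone; the other sections remain.
    print(demo_soup.get_text(" ", strip=True))  # -> History Kept. Legacy Kept too.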
def extract_text(soup: BeautifulSoup) -> str:
    """Extract the main text (paragraphs + headers + lists) from the article
    body only, preserving document order.

    Excludes content that is inside tables, and excludes headers that are also
    used as table names (either as a caption or as the nearest previous
    header), to avoid duplication with extract_tables.
    """
    content_root = soup.select_one("div.mw-parser-output") or soup
    for elem in content_root(["script", "style", "sup", "aside", "nav"]):
        elem.decompose()
    _cleanup_non_content(content_root)

    # Identify table names (from captions or nearest previous headers) so they
    # are not duplicated in the text output.
    table_names_normalized = set()
    for table in content_root.find_all("table"):
        # Skip non-content tables (same logic as extract_tables).
        classes = table.get("class", [])
        if isinstance(classes, list) and any(
            c.lower() in {"navbox", "vertical-navbox", "sidebar", "mbox", "metadata"}
            for c in classes
        ):
            continue
        name_text = None
        caption_el = table.find("caption")
        if caption_el:
            caption_text = caption_el.get_text(" ", strip=True)
            if caption_text:
                name_text = caption_text
            else:
                # Empty caption: treat as no caption and fall back to the
                # previous header.
                prev_header = table.find_previous(["h1", "h2", "h3", "h4", "h5", "h6"])
                if prev_header:
                    name_text = prev_header.get_text(" ", strip=True)
        else:
            prev_header = table.find_previous(["h1", "h2", "h3", "h4", "h5", "h6"])
            if prev_header:
                name_text = prev_header.get_text(" ", strip=True)
        if not name_text and isinstance(classes, list) and any(
            c.lower() == "infobox" for c in classes
        ):
            name_text = "Infobox"
        if name_text:
            table_names_normalized.add(_normalize_title(name_text))

    # Collect all text elements in document order, excluding duplicates.
    text_elements = []
    for element in content_root.find_all(["h1", "h2", "h3", "h4", "h5", "h6", "p", "li"]):
        # Skip elements inside a table (to avoid duplication with extract_tables).
        if element.find_parent("table"):
            continue
        # Skip headers that match any table name (to avoid duplication with
        # extract_tables).
        if element.name in {"h1", "h2", "h3", "h4", "h5", "h6"}:
            header_text_norm = _normalize_title(element.get_text(" ", strip=True))
            if header_text_norm in table_names_normalized:
                continue
        # Skip list items that are exactly a table name (common for inline
        # mini-TOCs within sections).
        if element.name == "li":
            li_text_norm = _normalize_title(element.get_text(" ", strip=True))
            if li_text_norm in table_names_normalized:
                continue
        text = element.get_text(" ", strip=True)
        if text:  # only include non-empty text
            text_elements.append(text)

    return "\n\n".join(text_elements)
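
# A quick illustrative check of extract_text on synthetic HTML (the wrapper
# div mimics Wikipedia's mw-parser-output container; not executed on import).
def _demo_extract_text() -> None:
    snippet = (
        '<div class="mw-parser-output">'
        "<h2>Career</h2><p>Some prose.</p>"
        '<table class="wikitable"><caption>Career</caption>'
        "<tr><th>Year</th></tr><tr><td>2020</td></tr></table>"
        "</div>"
    )
    demo_soup = BeautifulSoup(snippet, "html.parser")
    # The "Career" header is suppressed because it doubles as the table name,
    # and the table cells are left for extract_tables.
    print(extract_text(demo_soup))  # -> Some prose.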
def extract_tables(soup: BeautifulSoup) -> list[dict]:
    """Extract all HTML tables as dicts: {name, df}."""
    content_root = soup.select_one("div.mw-parser-output") or soup
    tables = []
    for table_idx, table in enumerate(content_root.find_all("table")):
        # Skip non-content tables (navboxes, sidebars, etc.).
        classes = table.get("class", [])
        if isinstance(classes, list) and any(
            c.lower() in {"navbox", "vertical-navbox", "sidebar", "mbox", "metadata"}
            for c in classes
        ):
            continue

        # Prefer an explicit caption.
        caption_el = table.find("caption")
        name = caption_el.get_text(" ", strip=True) if caption_el else None
        # Fallback: nearest previous section header.
        if not name:
            prev_header = table.find_previous(["h1", "h2", "h3", "h4", "h5", "h6"])
            if prev_header:
                name = prev_header.get_text(" ", strip=True)
        # Fallback: class-based hints (e.g., infobox).
        if not name and isinstance(classes, list) and any(
            c.lower() == "infobox" for c in classes
        ):
            name = "Infobox"
        # Final fallback.
        if not name:
            name = f"Table {table_idx + 1}"

        try:
            dfs = pd.read_html(StringIO(str(table)))
            if len(dfs) == 1:
                tables.append({"name": name, "df": dfs[0]})
            else:
                for part_idx, df in enumerate(dfs, start=1):
                    tables.append({"name": f"{name} (part {part_idx})", "df": df})
        except ValueError:
            continue
    return tables


def format_for_llm(text: str, tables: list[dict], sections_to_exclude: list[str]) -> str:
    """Combine text + tables into a single string for LLM input."""
    output = []
    output.append("=== ARTICLE TEXT ===\n")
    output.append(text)

    excluded = {_normalize_title(s) for s in sections_to_exclude}
    filtered_tables = [
        t for t in tables if _normalize_title(t.get("name", "")) not in excluded
    ]
    for i, t in enumerate(filtered_tables, start=1):
        tname = t.get("name") or f"Table {i}"
        df = t["df"]
        output.append(f"\n\n=== TABLE {i}: {tname} ===\n")
        output.append(df.to_markdown(index=False))
    return "\n".join(output)


@tool
def wikipedia_summary(entity: str) -> dict:
    """
    Search Wikipedia for an entity and return a dictionary with the summary of
    the page and the URL of the page.

    Args:
        entity: The entity to search for. ALWAYS pass exactly the entity name
            (person/place/event/concept) with no qualifiers.

    Returns:
        A dictionary with the summary of the page and the URL of the page.
    """
    wiki = wikipediaapi.Wikipedia(
        user_agent=f"My research agent ({os.getenv('USER_EMAIL')})",
    )
    page = wiki.page(entity)
    if not page.exists():
        raise ValueError(f"No Wikipedia page found for '{entity}'. Try a different query.")
    sections = [section.title for section in page.sections]
    return {
        "summary": (
            f"The sections inside the page are {', '.join(sections)} "
            f"and the summary of the page is {page.summary}"
        ),
        # wikipediaapi exposes the canonical URL directly, so no second
        # library or extra network round trip is needed.
        "url": page.fullurl,
    }


@tool
def read_wikipedia_page(
    url: str,
    sections_to_exclude: list[str] = [
        "External links",
        "References",
        "Further reading",
        "See also",
        "Notes",
    ],
) -> str:
    """
    Read a Wikipedia page and return a string with the text of the page.

    Args:
        url: The URL of the Wikipedia page to read.
        sections_to_exclude: A list of sections to exclude from the page.

    Returns:
        A string with the text of the page.
    """
    if not url.startswith("https://en.wikipedia.org/wiki/"):
        raise ValueError(
            "Expected an English Wikipedia article URL "
            "(https://en.wikipedia.org/wiki/...)."
        )
    # Fetch the page.
    html = fetch_wikipedia_page(url)
    # Parse the page.
    soup = BeautifulSoup(html, "html.parser")
    # Remove unwanted sections.
    _remove_sections_by_titles(soup, sections_to_exclude)
    # Extract after pruning unwanted sections.
    text = extract_text(soup)
    tables = extract_tables(soup)
    # Combine.
    return format_for_llm(text, tables, sections_to_exclude)
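
# Minimal end-to-end usage sketch. The article URL is illustrative; this makes
# a live network request, assumes the page exists, and requires pandas's
# optional HTML dependencies (lxml or html5lib) for table parsing.
if __name__ == "__main__":
    page_text = read_wikipedia_page(
        "https://en.wikipedia.org/wiki/Python_(programming_language)"
    )
    print(page_text[:1000])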