from smolagents import Tool
import requests
from bs4 import BeautifulSoup, Tag


class WikipediaParser(Tool):
    name: str = "wikipedia_parser_tool"
    description: str = (
        "This tool parses a Wikipedia page into a clean, readable text format."
    )
    inputs: dict[str, dict[str, str]] = {
        "url": {
            "type": "string",
            "description": "The Wikipedia page URL.",
        }
    }
    output_type: str = "string"
    def get_wikipedia_page(self, url: str) -> str:
        """
        Fetches a Wikipedia page and converts it into readable text.

        Args:
            url (str): The URL of the Wikipedia page.

        Returns:
            str: The parsed text content of the page.
        """
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36"
        }
        resp = requests.get(url, headers=headers, timeout=30)
        resp.raise_for_status()
        soup = BeautifulSoup(resp.text, "html.parser")
        content_div = soup.find("div", id="mw-content-text")
        if not content_div:
            return "Content not found."
        elements: list[str] = []
        h_tags: list[str] = [f"h{i}" for i in range(1, 6)]
        extra_tags: list[str] = ["p", "ul", "ol"]
        # Include "table" in the search so the table branch below is reachable.
        html_tags: list[str] = h_tags + extra_tags + ["table"]
        for elem in content_div.find_all(html_tags):
            if elem.name in h_tags:
                elements.append("\n\n" + elem.get_text(separator=" ", strip=True) + "\n")
            elif elem.name in extra_tags:
                elements.append(elem.get_text(separator=" ", strip=True))
            elif elem.name == "table":
                elements.append(self.parse_wikipedia_table(elem))
        return "\n\n".join(elements)
    def parse_wikipedia_table(self, table: Tag) -> str:
        """
        Parses a Wikipedia table into a clean, readable text format.

        Args:
            table (Tag): BeautifulSoup Tag for the table.

        Returns:
            str: Formatted table as readable text.
        """
        rows: list[str] = []
        headers: list[str] = []
        # Try to get headers
        thead = table.find("thead")
        if thead:
            for th in thead.find_all("th"):
                header_text = th.get_text(separator=" ", strip=True)
                headers.append(header_text)
        if headers:
            rows.append(" | ".join(headers))
        # Parse table body rows
        tbody = table.find("tbody")
        if not tbody:
            tbody = table  # fallback: some tables have no explicit tbody
        for tr in tbody.find_all("tr"):
            cells = tr.find_all(["th", "td"])
            cell_texts = []
            for cell in cells:
                # Remove reference markers like [7], [note 1], etc.
                for sup in cell.find_all("sup", class_="reference"):
                    sup.decompose()
                text = cell.get_text(separator=" ", strip=True)
                cell_texts.append(text)
            if cell_texts:
                row_text = " | ".join(cell_texts)
                rows.append(row_text)
        return "\n".join(rows)
    def forward(self, url: str) -> str:
        """
        Parses the Wikipedia page and returns the content as a string.

        Args:
            url (str): The URL of the Wikipedia page.

        Returns:
            str: The parsed content of the page.
        """
        return self.get_wikipedia_page(url)