from bs4 import BeautifulSoup
import requests
import re
from smolagents import tool
import torch
import spaces
from transformers import pipeline
from pdfminer.high_level import extract_text
import os
from youtube_comment_downloader import (
    YoutubeCommentDownloader,
    SORT_BY_POPULAR,
)
from youtube_transcript_api import YouTubeTranscriptApi


def _fetch_wikipedia_soup(page_title: str) -> BeautifulSoup:
    """Fetch an English-Wikipedia article and return its parsed HTML.

    Spaces in the title are converted to underscores, matching Wikipedia's
    URL scheme. Raises requests.HTTPError for non-2xx responses.
    """
    url = f"https://en.wikipedia.org/wiki/{page_title.replace(' ', '_')}"
    resp = requests.get(url, timeout=10)
    resp.raise_for_status()
    return BeautifulSoup(resp.text, "html.parser")


def _heading_text(div) -> str:
    """Return a heading div's visible text with the trailing '[edit]' removed."""
    text = div.get_text(strip=True)
    return re.sub(r"\[edit\]$", "", text).strip()


def _is_section_heading(tag) -> bool:
    """True if *tag* is a section-heading wrapper (<div class="mw-heading...">)."""
    return bool(
        tag.name == "div"
        and tag.get("class")
        and any(c.startswith("mw-heading") for c in tag["class"])
    )


def _find_section_heading(soup: BeautifulSoup, section_title: str):
    """Return the heading div matching *section_title* (case-insensitive), or None.

    Returning None (rather than reusing the loop variable, as the previous
    implementation did) lets callers reliably detect a missing section.
    """
    for div in soup.find_all(
        "div", class_=lambda cls: cls and cls.startswith("mw-heading")
    ):
        if _heading_text(div).lower() == section_title.lower():
            return div
    return None


@tool
def list_wikipedia_sections(page_title: str) -> list[str]:
    """
    Return an ordered list of section headings from a Wikipedia article.

    Args:
        page_title (str): Title of the Wikipedia article,
            e.g., "Python (programming language)".

    Returns:
        list[str]: A list of section titles in the order they appear.
            Example: ["History", "Discography", "References", ...].
            Returns an empty list if the article is not found or contains
            no sections.
    """
    soup = _fetch_wikipedia_soup(page_title)
    sections = []
    for div in soup.find_all(
        "div", class_=lambda cls: cls and cls.startswith("mw-heading")
    ):
        text = _heading_text(div)
        if text:
            sections.append(text)
    return sections


@tool
def count_tables_in_wikipedia_section(
    page_title: str, section_title: str
) -> int:
    """
    Return the number of HTML <table> elements found within a specified
    section of a Wikipedia article.

    Args:
        page_title (str): Title of the Wikipedia article (spaces will be
            replaced with underscores), e.g., "Python (programming language)".
        section_title (str): Visible heading of the section to inspect,
            e.g., "Discography".

    Returns:
        int: The count of <table> tags under the given section heading,
            stopping at the next section. Returns 0 if the article or
            section is not found or if no tables are present.
    """
    soup = _fetch_wikipedia_soup(page_title)
    # Bug fix: the previous version shadowed its sentinel with the loop
    # variable, so a missing section silently counted tables after the
    # LAST heading on the page instead of returning 0.
    heading = _find_section_heading(soup, section_title)
    if heading is None:
        return 0
    count = 0
    for sibling in heading.find_next_siblings():
        if _is_section_heading(sibling):
            break  # reached the next section
        if sibling.name == "table":
            count += 1
    return count


@tool
def extract_nth_table_in_wikipedia_section(
    page_title: str, section_title: str, n: int
) -> str:
    """
    Extract the nth table within a specified section of a Wikipedia
    article, rendered as tab-separated text.

    Args:
        page_title (str): Title of the Wikipedia article (e.g., "Queen (band)"
            or "Python (programming language)"). Spaces are automatically
            replaced with underscores.
        section_title (str): Visible title of the section to search
            (e.g., "Discography").
        n (int): 1-based index specifying which table to extract
            (1 for the first table, 2 for the second, etc.).

    Returns:
        str: The table contents as TSV text, one row per line with cell
            texts joined by tabs. Returns an empty string if:
            - The article cannot be found.
            - The section does not exist.
            - The section contains fewer than n tables.
    """
    soup = _fetch_wikipedia_soup(page_title)
    heading = _find_section_heading(soup, section_title)
    if heading is None:
        # Bug fix: previously a missing section left the loop variable
        # pointing at the last heading and scanned the wrong section.
        return ""

    # Walk siblings until the next section heading, looking for table n.
    count = 0
    tbl = None
    for sib in heading.find_next_siblings():
        if _is_section_heading(sib):
            break
        if sib.name == "table":
            count += 1
            if count == n:
                tbl = sib
                break
    if tbl is None:
        # Bug fix: previously fell through to tbl.find_all(...) and raised
        # AttributeError when the section held fewer than n tables.
        return ""

    # Convert the table to TSV; tabs inside cell text are replaced so the
    # column structure stays unambiguous.
    rows = []
    for tr in tbl.find_all("tr"):
        cells = tr.find_all(["th", "td"])
        texts = [
            cell.get_text(separator=" ", strip=True).replace("\t", " ")
            for cell in cells
        ]
        if texts:
            rows.append("\t".join(texts))
    return "\n".join(rows)


@tool
def wikipedia_featured_articles_title(year: int) -> str:
    """
    Return the Wikipedia page title listing Featured Articles that were
    promoted in a given year.

    Args:
        year (int): The calendar year of interest (e.g., 2021, 2025).
            Note: This tool should be used in conjunction with
            list_wikipedia_sections.

    Returns:
        str: The Wikipedia page title. Example: if year = 2021, returns
            "Wikipedia:Featured articles promoted in 2021".
    """
    return f"Wikipedia:Featured articles promoted in {year}"


device = (
    torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
)

# Loaded once at import time so repeated transcriptions reuse the model.
asr_pipeline = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-large-v3",
    device=device,
    chunk_length_s=30,
    return_timestamps=False,
    generate_kwargs={"task": "transcribe"},
)


@tool
@spaces.GPU
def transcribe_audio_with_whisper(filename: str) -> str:
    """
    Transcribe an audio file into text using Whisper.

    Args:
        filename (str): Path to the audio file on disk
            (e.g., "input/Strawberry pie.mp3").

    Returns:
        str: Transcribed text of the audio content.
    """
    with open(filename, "rb") as f:
        audio_bytes = f.read()
    return asr_pipeline(audio_bytes)["text"]


@tool
def extract_page_numbers(text: str) -> str:
    """
    Extract all page numbers referenced explicitly after the word 'page'
    or 'pages' in the text.

    Args:
        text (str): Input text that may mention "page 1",
            "pages 10, 20 and 30", etc.

    Returns:
        str: A comma delimited list of unique page numbers in ascending order.
    """
    # Matches "page 7", "pages 10, 20", "pages 10, 20 and 30", etc.
    matches = re.findall(
        r"\bpages?\s+([0-9]+(?:\s*,\s*[0-9]+)*(?:\s+and\s+[0-9]+)?)",
        text,
        flags=re.IGNORECASE,
    )
    pages = set()
    for match in matches:
        for num in re.split(r"(?:,|\band\b)", match):
            num = num.strip()
            if num.isdigit():
                pages.add(int(num))
    if not pages:
        return ""
    return ", ".join(str(p) for p in sorted(pages))


@tool
def fetch_raw_html(url: str) -> str:
    """
    Return the raw HTML content of the given URL.

    Args:
        url (str): A fully qualified HTTP/HTTPS URL.

    Returns:
        str: Raw HTML content of the page, so URLs and links can be
            extracted via parsing.
    """
    resp = requests.get(url, timeout=10)
    resp.raise_for_status()
    return resp.text


@tool
def extract_links(html: str) -> list[str]:
    """
    Parse HTML and return all unique href URLs found.

    Args:
        html (str): Raw HTML content.

    Returns:
        list[str]: Unique absolute or relative link URLs found in
            <a href="..."> tags.
    """
    soup = BeautifulSoup(html, "html.parser")
    return list({tag["href"] for tag in soup.find_all("a", href=True)})


@tool
def extract_text_from_pdf(pdf_path: str) -> str:
    """
    Extract all readable text from a PDF file.

    Args:
        pdf_path (str): Path to the PDF file (e.g. "input/paper.pdf").

    Returns:
        str: Complete extracted text from the PDF. Returns an empty string
            if extraction fails or file isn't found.
    """
    try:
        text = extract_text(pdf_path)
        return text or ""
    except Exception:
        # Best-effort by design: callers treat "" as "no text available".
        return ""


@tool
def fetch_file(url: str, save_path: str) -> str:
    """
    Download a file from a URL and save it locally, creating directories
    if needed.

    Args:
        url (str): Direct link to the file (e.g., a PDF).
        save_path (str): Local file path where content will be saved
            (e.g., "input/paper.pdf").

    Returns:
        str: The local save_path if download and save succeeded; empty
            string if an error occurred.
    """
    try:
        parent_dir = os.path.dirname(save_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        resp = requests.get(url, timeout=20)
        resp.raise_for_status()
        with open(save_path, "wb") as f:
            f.write(resp.content)
        return save_path
    except Exception:
        # Best-effort by design: callers treat "" as "download failed".
        return ""


@tool
def normalize_place_name(place: str) -> str:
    """
    Convert abbreviated place names like 'St. Petersburg' to
    'Saint Petersburg'.

    Args:
        place (str): City name, possibly abbreviated
            (e.g., "St. Petersburg").

    Returns:
        str: Fully written name (e.g., "Saint Petersburg").
    """
    if place.startswith("St. "):
        return "Saint " + place[4:]
    return place


@tool
def is_drink(item_name: str) -> bool:
    """
    Return True if the item is a beverage or drink, False otherwise.

    Args:
        item_name (str): Name of a menu item (e.g., "ice cream", "cola").

    Returns:
        bool: True if it's a drink, False if it's food.
    """
    drinks = ["soda", "cola", "coffee", "tea", "juice", "milkshake", "water"]
    return item_name.lower() in drinks


@tool
def download_youtube_comments(url: str, max_comments: int = 100) -> list[str]:
    """
    Download up to `max_comments` popular comments from a YouTube video.

    Args:
        url (str): Full URL of a YouTube video
            (e.g. "https://www.youtube.com/watch?v=abc123").
        max_comments (int): Maximum number of comments to retrieve
            (default is 100).

    Returns:
        list[str]: List of plain-text comment strings. Returns an empty
            list if the video can't be accessed or no comments are found.
    """
    downloader = YoutubeCommentDownloader()
    comments = []
    for comment in downloader.get_comments_from_url(
        url, sort_by=SORT_BY_POPULAR
    ):
        comments.append(comment["text"])
        if len(comments) >= max_comments:
            break
    return comments


@tool
def get_youtube_transcript(video_url: str) -> list[str]:
    """
    Extracts the transcript from a YouTube video as a list of text segments.

    Args:
        video_url (str): Full YouTube video URL
            (e.g. "https://www.youtube.com/watch?v=abc123xyz").

    Returns:
        list[str]: List of transcript lines. Empty list if transcript
            not available.
    """
    # Accept both "watch?v=<id>" and "youtu.be/<id>" forms; IDs are 11 chars.
    match = re.search(r"(?:v=|youtu\.be/)([a-zA-Z0-9_-]{11})", video_url)
    if not match:
        return []
    video_id = match.group(1)
    try:
        transcript = YouTubeTranscriptApi.get_transcript(video_id)
        return [
            entry["text"].strip().capitalize()
            for entry in transcript
            if entry["text"].strip()
        ]
    except Exception:
        # Transcripts may be disabled or unavailable; treat as "none".
        return []