Spaces:
Runtime error
Runtime error
| from bs4 import BeautifulSoup | |
| import requests | |
| import re | |
| from smolagents import tool | |
| import torch | |
| import spaces | |
| from transformers import pipeline | |
| from pdfminer.high_level import extract_text | |
| import os | |
| from youtube_comment_downloader import ( | |
| YoutubeCommentDownloader, | |
| SORT_BY_POPULAR, | |
| ) | |
| from youtube_transcript_api import YouTubeTranscriptApi | |
def list_wikipedia_sections(page_title: str) -> list[str]:
    """
    Return an ordered list of section headings from a Wikipedia article.

    Args:
        page_title (str): Title of the Wikipedia article,
            e.g., "Python (programming language)". Spaces are replaced
            with underscores when the URL is built.

    Returns:
        list[str]: A list of section titles in the order they appear.
            Example: ["History", "Discography", "References", ...].
            Returns an empty list if the article is not found (HTTP 404)
            or contains no sections.

    Raises:
        requests.RequestException: On network failure or any HTTP error
            status other than 404.
    """
    url = f"https://en.wikipedia.org/wiki/{page_title.replace(' ', '_')}"
    resp = requests.get(url, timeout=10)
    # A missing article is a documented, non-exceptional outcome; every
    # other HTTP error still surfaces through raise_for_status().
    if resp.status_code == 404:
        return []
    resp.raise_for_status()

    soup = BeautifulSoup(resp.text, "html.parser")
    sections = []
    for heading in soup.find_all(
        "div", class_=lambda cls: cls and cls.startswith("mw-heading")
    ):
        # Heading wrappers may carry a trailing "[edit]" link label.
        text = re.sub(r"\[edit\]$", "", heading.get_text(strip=True)).strip()
        if text:
            sections.append(text)
    return sections
def count_tables_in_wikipedia_section(
    page_title: str, section_title: str
) -> int:
    """
    Return the number of HTML <table> elements found within a specified section of a Wikipedia article.

    Args:
        page_title (str): Title of the Wikipedia article (spaces will be replaced with underscores),
            e.g., "Python (programming language)".
        section_title (str): Visible heading of the section to inspect, e.g., "Discography".
            Compared case-insensitively.

    Returns:
        int: The count of <table> tags that are direct siblings of the section heading,
            stopping at the next heading. Returns 0 if the article (HTTP 404) or the
            section is not found, or if no tables are present.

    Raises:
        requests.RequestException: On network failure or any HTTP error
            status other than 404.
    """
    url = f"https://en.wikipedia.org/wiki/{page_title.replace(' ', '_')}"
    resp = requests.get(url, timeout=10)
    # A missing article is a documented, non-exceptional outcome.
    if resp.status_code == 404:
        return 0
    resp.raise_for_status()

    soup = BeautifulSoup(resp.text, "html.parser")
    wanted = section_title.lower()

    # Keep the match in its own variable: reusing the loop variable would
    # leave it bound to the last heading when nothing matches.
    heading = None
    for candidate in soup.find_all(
        "div", class_=lambda cls: cls and cls.startswith("mw-heading")
    ):
        title = re.sub(r"\[edit\]$", "", candidate.text.strip()).strip()
        if title.lower() == wanted:
            heading = candidate
            break
    if heading is None:
        return 0

    count = 0
    for sibling in heading.find_next_siblings():
        # The next heading wrapper (of any level) ends this section.
        if (
            sibling.name == "div"
            and sibling.get("class")
            and any(c.startswith("mw-heading") for c in sibling["class"])
        ):
            break
        if sibling.name == "table":
            count += 1
    return count
def extract_nth_table_in_wikipedia_section(
    page_title: str, section_title: str, n: int
) -> str:
    """
    Extract the nth table within a specified section of a Wikipedia article as tab-separated text.

    Args:
        page_title (str):
            Title of the Wikipedia article (e.g., "Queen (band)" or "Python (programming language)").
            Spaces are automatically replaced with underscores.
        section_title (str):
            Visible title of the section to search (e.g., "Discography").
            Compared case-insensitively.
        n (int):
            1-based index specifying which table to extract (1 for the first table, 2 for second, etc.).

    Returns:
        str: The requested table rendered as TSV: one line per <tr>, with the text of
            each <th>/<td> cell separated by tabs (tabs inside a cell become spaces).
            Returns an empty string if:
            - The article cannot be found (HTTP 404).
            - The section does not exist.
            - The section contains fewer than n tables (or n is less than 1).

    Raises:
        requests.RequestException: On network failure or any HTTP error
            status other than 404.
    """
    url = f"https://en.wikipedia.org/wiki/{page_title.replace(' ', '_')}"
    resp = requests.get(url, timeout=10)
    # A missing article is a documented, non-exceptional outcome.
    if resp.status_code == 404:
        return ""
    resp.raise_for_status()

    soup = BeautifulSoup(resp.text, "html.parser")
    wanted = section_title.lower()

    # Keep the match in its own variable: reusing the loop variable would
    # leave it bound to the last heading when nothing matches.
    heading = None
    for candidate in soup.find_all(
        "div", class_=lambda cls: cls and cls.startswith("mw-heading")
    ):
        title = re.sub(r"\[edit\]$", "", candidate.text.strip()).strip()
        if title.lower() == wanted:
            heading = candidate
            break
    if heading is None:
        return ""

    # Walk the heading's siblings until the next heading wrapper.
    count = 0
    tbl = None
    for sib in heading.find_next_siblings():
        if (
            sib.name == "div"
            and sib.get("class")
            and any(c.startswith("mw-heading") for c in sib["class"])
        ):
            break
        if sib.name == "table":
            count += 1
            if count == n:
                tbl = sib
                break
    if tbl is None:
        # Fewer than n tables in the section.
        return ""

    # Convert the table to TSV.
    rows = []
    for tr in tbl.find_all("tr"):
        texts = [
            cell.get_text(separator=" ", strip=True).replace("\t", " ")
            for cell in tr.find_all(["th", "td"])
        ]
        if texts:
            rows.append("\t".join(texts))
    return "\n".join(rows)
def wikipedia_featured_articles_title(year: int) -> str:
    """
    Build the title of the Wikipedia page that lists Featured Articles promoted in a given year.

    Args:
        year (int): The calendar year of interest (e.g., 2021, 2025).

    Note:
        The returned title is meant to be passed on to list_wikipedia_sections.

    Returns:
        str: The Wikipedia page title.
            Example: if year = 2021, returns
            "Wikipedia:Featured articles promoted in 2021".
    """
    page_title = f"Wikipedia:Featured articles promoted in {year}"
    return page_title
# Run on the GPU when torch reports one, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Module-level Whisper speech-recognition pipeline, created once at import
# time and reused by transcribe_audio_with_whisper. Audio is processed in
# 30-second chunks, no timestamps are requested, and generation is pinned to
# the "transcribe" task.
asr_pipeline = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-large-v3",
    device=device,
    chunk_length_s=30,
    return_timestamps=False,
    generate_kwargs={"task": "transcribe"},
)
def transcribe_audio_with_whisper(filename: str) -> str:
    """
    Transcribe an audio file into text using the module-level Whisper pipeline.

    Args:
        filename (str): Path to the audio file on disk (e.g., "input/Strawberry pie.mp3").

    Returns:
        str: Transcribed text of the audio content.
    """
    # The pipeline is handed the raw file bytes rather than a path.
    with open(filename, "rb") as audio_file:
        payload = audio_file.read()
    result = asr_pipeline(payload)
    return result["text"]
def extract_page_numbers(text: str) -> str:
    """
    Extract all page numbers referenced explicitly after the word 'page' or 'pages' in the text.

    Matching is case-insensitive and accepts comma-separated lists with an
    optional final "and", with or without a comma before it
    ("pages 1, 2 and 3", "pages 1, 2, and 3").

    Args:
        text (str): Input text that may mention "page 1", "pages 10, 20 and 30", etc.

    Returns:
        str: A comma delimited list of unique page numbers in ascending order,
            or an empty string if no page reference is found.
    """
    matches = re.findall(
        r"\bpages?\s+([0-9]+(?:\s*,\s*[0-9]+)*(?:\s*,?\s*\band\s+[0-9]+)?)",
        text,
        flags=re.IGNORECASE,
    )
    pages = set()
    for match in matches:
        # Pull the digit runs directly instead of splitting on separators, so
        # the separator's case ("and" / "AND") cannot hide a number.
        for num in re.findall(r"[0-9]+", match):
            pages.add(int(num))
    return ", ".join(str(p) for p in sorted(pages))
def fetch_raw_html(url: str) -> str:
    """
    Download a page and return its body as text.

    Args:
        url (str): A fully qualified HTTP/HTTPS URL.

    Returns:
        str: Raw HTML content of the page, so URLs and links can be extracted via parsing.
    """
    response = requests.get(url, timeout=10)
    # Error statuses raise here instead of returning an error page.
    response.raise_for_status()
    html = response.text
    return html
def extract_links(html: str) -> list[str]:
    """
    Parse HTML and return all unique href URLs found.

    Args:
        html (str): Raw HTML content.

    Returns:
        list[str]: Unique absolute or relative link URLs found in <a href="..."> tags,
            in order of first appearance in the document.
    """
    soup = BeautifulSoup(html, "html.parser")
    hrefs = (tag["href"] for tag in soup.find_all("a", href=True))
    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # result is reproducible between runs (a set's order is arbitrary).
    return list(dict.fromkeys(hrefs))
def extract_text_from_pdf(pdf_path: str) -> str:
    """
    Pull the readable text out of a PDF file.

    Args:
        pdf_path (str): Path to the PDF file (e.g. "input/paper.pdf").

    Returns:
        str: Complete extracted text from the PDF.
            Returns an empty string if extraction fails or file isn't found.
    """
    # Best effort: any failure is reported as an empty result.
    try:
        extracted = extract_text(pdf_path)
    except Exception:
        return ""
    return extracted if extracted else ""
def fetch_file(url: str, save_path: str) -> str:
    """
    Download a URL to a local file, creating the parent directory first when needed.

    Args:
        url (str): Direct link to the file (e.g., a PDF).
        save_path (str): Local file path where content will be saved
            (e.g., "input/paper.pdf").

    Returns:
        str: The local save_path if download and save succeeded;
            empty string if an error occurred.
    """
    # Best effort: any failure along the way is reported as "".
    try:
        target_dir = os.path.dirname(save_path)
        if target_dir:
            os.makedirs(target_dir, exist_ok=True)

        response = requests.get(url, timeout=20)
        response.raise_for_status()

        with open(save_path, "wb") as out_file:
            out_file.write(response.content)
    except Exception:
        return ""
    return save_path
def normalize_place_name(place: str) -> str:
    """
    Expand a leading "St. " abbreviation in a place name to "Saint ".

    Args:
        place (str): City name, possibly abbreviated (e.g., "St. Petersburg").

    Returns:
        str: Fully written name (e.g., "Saint Petersburg"). Names that do
            not start with "St. " are returned unchanged.
    """
    abbreviation = "St. "
    if not place.startswith(abbreviation):
        return place
    remainder = place[len(abbreviation):]
    return "Saint " + remainder
def is_drink(item_name: str) -> bool:
    """
    Tell whether a menu item is one of the known beverages.

    Args:
        item_name (str): Name of a menu item (e.g., "ice cream", "cola").

    Returns:
        bool: True if it's a drink, False if it's food.
    """
    # The lookup is case-insensitive but otherwise an exact match.
    known_drinks = (
        "soda",
        "cola",
        "coffee",
        "tea",
        "juice",
        "milkshake",
        "water",
    )
    normalized = item_name.lower()
    return normalized in known_drinks
def download_youtube_comments(url: str, max_comments: int = 100) -> list[str]:
    """
    Download up to `max_comments` popular comments from a YouTube video.

    Args:
        url (str): Full URL of a YouTube video (e.g. "https://www.youtube.com/watch?v=abc123").
        max_comments (int): Maximum number of comments to retrieve (default is 100).
            Zero or a negative value yields an empty list.

    Returns:
        list[str]: List of plain-text comment strings. Returns an empty list if the video
            can't be accessed or no comments are found. If the download fails part-way,
            the comments collected up to that point are returned.
    """
    # Check the limit up front; checking only after an append would always
    # return at least one comment.
    if max_comments <= 0:
        return []

    comments = []
    try:
        downloader = YoutubeCommentDownloader()
        for comment in downloader.get_comments_from_url(
            url, sort_by=SORT_BY_POPULAR
        ):
            comments.append(comment["text"])
            if len(comments) >= max_comments:
                break
    except Exception:
        # Best effort, as the docstring promises: an inaccessible video
        # yields whatever was gathered (possibly nothing), not an exception.
        pass
    return comments
def get_youtube_transcript(video_url: str) -> list[str]:
    """
    Fetch a YouTube video's transcript as a list of text segments.

    Args:
        video_url (str): Full YouTube video URL (e.g. "https://www.youtube.com/watch?v=abc123xyz").

    Returns:
        list[str]: List of transcript lines. Empty list if transcript not available.
    """
    # The video id is the 11-character token after "v=" or "youtu.be/".
    id_match = re.search(r"(?:v=|youtu\.be/)([a-zA-Z0-9_-]{11})", video_url)
    if id_match is None:
        return []

    # Best effort: any failure while fetching is reported as an empty list.
    try:
        segments = YouTubeTranscriptApi.get_transcript(id_match.group(1))
        lines = []
        for segment in segments:
            stripped = segment["text"].strip()
            if stripped:
                lines.append(stripped.capitalize())
        return lines
    except Exception:
        return []