from bs4 import BeautifulSoup
import requests
import re
from smolagents import tool
import torch
import spaces
from transformers import pipeline
from pdfminer.high_level import extract_text
import os
from youtube_comment_downloader import (
    YoutubeCommentDownloader,
    SORT_BY_POPULAR,
)
from youtube_transcript_api import YouTubeTranscriptApi


@tool
def list_wikipedia_sections(page_title: str) -> list[str]:
    """
    Return an ordered list of section headings from a Wikipedia article.

    Args:
        page_title (str): Title of the Wikipedia article,
                          e.g., "Python (programming language)".

    Returns:
        list[str]: A list of section titles in the order they appear.
                   Example: ["History", "Discography", "References", ...].
                   Returns an empty list if the article is not found or
                   contains no sections.
    """
    url = f"https://en.wikipedia.org/wiki/{page_title.replace(' ', '_')}"
    resp = requests.get(url, timeout=10)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")

    sections = []

    # Modern Wikipedia markup wraps each heading in <div class="mw-heading...">.
    for div in soup.find_all(
        "div", class_=lambda cls: cls and cls.startswith("mw-heading")
    ):
        text = div.get_text(strip=True)
        if text:
            # Drop the trailing "[edit]" link text that rides along with headings.
            text = re.sub(r"\[edit\]$", "", text).strip()
            sections.append(text)

    return sections
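

# Example (illustrative; requires network access):
#     list_wikipedia_sections("Python (programming language)")
#     # -> ["History", "Design philosophy and features", ..., "External links"]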


@tool
def count_tables_in_wikipedia_section(
    page_title: str, section_title: str
) -> int:
    """
    Return the number of HTML <table> elements found within a specified section of a Wikipedia article.

    Args:
        page_title (str): Title of the Wikipedia article (spaces will be replaced with underscores),
                          e.g., "Python (programming language)".
        section_title (str): Visible heading of the section to inspect, e.g., "Discography".

    Returns:
        int: The count of <table> tags under the given section heading, stopping at the next section.
             Returns 0 if the article or section is not found or if no tables are present.
    """
    url = f"https://en.wikipedia.org/wiki/{page_title.replace(' ', '_')}"
    resp = requests.get(url, timeout=10)
    resp.raise_for_status()

    soup = BeautifulSoup(resp.text, "html.parser")

    # Locate the heading div whose visible text matches section_title.
    # A separate `heading` sentinel is used here: reusing the loop variable
    # would leave it pointing at the last heading on the page when no match
    # is found, so a missing section would go undetected.
    heading = None
    for div in soup.find_all(
        "div", class_=lambda cls: cls and cls.startswith("mw-heading")
    ):
        title = re.sub(r"\[edit\]$", "", div.get_text(strip=True)).strip()
        if title.lower() == section_title.lower():
            heading = div
            break

    if heading is None:
        return 0

    count = 0

    # Walk the heading's siblings, counting tables until the next heading.
    for sibling in heading.find_next_siblings():
        if sibling.name == "table":
            count += 1
        if (
            sibling.name == "div"
            and sibling.get("class")
            and any(c.startswith("mw-heading") for c in sibling["class"])
        ):
            break

    return count
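

# Example (illustrative; requires network access):
#     count_tables_in_wikipedia_section("Queen (band)", "Discography")
#     # -> the number of tables under the "Discography" heading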


@tool
def extract_nth_table_in_wikipedia_section(
    page_title: str, section_title: str, n: int
) -> str:
    """
    Extract the HTML of the nth table within a specified section of a Wikipedia article.

     Args:
         page_title (str):
             Title of the Wikipedia article (e.g., "Queen (band)" or "Python (programming language)").
             Spaces are automatically replaced with underscores.
         section_title (str):
             Visible title of the section to search (e.g., "Discography").
         n (int):
             1-based index specifying which table to extract (1 for the first table, 2 for second, etc.).

     Returns:
         str: The full HTML string of the requested <table> element, including all nested tags.
              Returns an empty string if:
              - The article cannot be found.
              - The section does not exist.
              - The section contains fewer than n tables.
    """
    url = f"https://en.wikipedia.org/wiki/{page_title.replace(' ', '_')}"
    resp = requests.get(url, timeout=10)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")

    # Locate the matching section heading (see count_tables_in_wikipedia_section
    # for why a separate sentinel variable is needed here).
    heading = None
    for div in soup.find_all(
        "div", class_=lambda cls: cls and cls.startswith("mw-heading")
    ):
        title = re.sub(r"\[edit\]$", "", div.get_text(strip=True)).strip()
        if title.lower() == section_title.lower():
            heading = div
            break
    if heading is None:
        return ""

    # Iterate siblings until next section header
    count = 0
    tbl = None
    for sib in heading.find_next_siblings():
        if (
            sib.name == "div"
            and sib.get("class")
            and any(c.startswith("mw-heading") for c in sib["class"])
        ):
            break
        if sib.name == "table":
            count += 1
            if count == n:
                tbl = sib
                break

    if tbl is None:
        # Fewer than n tables in the section.
        return ""

    # Convert the table to TSV.
    rows = []
    for tr in tbl.find_all("tr"):
        cells = tr.find_all(["th", "td"])
        texts = [
            cell.get_text(separator=" ", strip=True).replace("\t", " ")
            for cell in cells
        ]
        if texts:
            rows.append("\t".join(texts))

    return "\n".join(rows)


@tool
def wikipedia_featured_articles_title(year: int) -> str:
    """
    Return the Wikipedia page title listing Featured Articles that were promoted in a given year.

    Args:
        year (int): The calendar year of interest (e.g., 2021, 2025).

    Note:
        This tool should be used in conjunction with list_wikipedia_sections.

    Returns:
        str: The Wikipedia page title.
             Example: if year = 2021, returns
             "Wikipedia:Featured articles promoted in 2021".
    """
    return f"Wikipedia:Featured articles promoted in {year}"


# Pick the GPU when available; fall back to CPU otherwise.
device = (
    torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
)

# Whisper ASR pipeline, created once at import time. chunk_length_s=30 splits
# long recordings into 30-second windows so audio longer than Whisper's
# context window can still be transcribed end to end.
asr_pipeline = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-large-v3",
    device=device,
    chunk_length_s=30,
    return_timestamps=False,
    generate_kwargs={"task": "transcribe"},
)


@tool
@spaces.GPU
def transcribe_audio_with_whisper(filename: str) -> str:
    """
    Transcribe an audio file into text using Whisper.

    Args:
        filename (str): Path to the audio file on disk (e.g., "input/Strawberry pie.mp3").

    Returns:
        str: Transcribed text of the audio content.
    """
    with open(filename, "rb") as f:
        audio_bytes = f.read()
    return asr_pipeline(audio_bytes)["text"]
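

# Example (illustrative; the file path is a placeholder):
#     transcribe_audio_with_whisper("input/Strawberry pie.mp3")
#     # -> the plain transcribed text of the recording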


@tool
def extract_page_numbers(text: str) -> str:
    """
    Extract all page numbers referenced explicitly after the word 'page' or 'pages' in the text.

    Args:
        text (str): Input text that may mention "page 1", "pages 10, 20 and 30", etc.

    Returns:
        str: A comma-delimited list of unique page numbers in ascending
             order (e.g., "7, 10, 20"), or an empty string if none are found.
    """
    matches = re.findall(
        r"\bpages?\s+([0-9]+(?:\s*,\s*[0-9]+)*(?:\s+and\s+[0-9]+)?)",
        text,
        flags=re.IGNORECASE,
    )
    pages = set()
    for match in matches:
        for num in re.split(r"(?:,|\band\b)", match):
            num = num.strip()
            if num.isdigit():
                pages.add(int(num))

    if not pages:
        return ""

    return ", ".join(str(p) for p in sorted(pages))


@tool
def fetch_raw_html(url: str) -> str:
    """
    Return the raw HTML content of the given URL.

    Args:
        url (str): A fully qualified HTTP/HTTPS URL.

    Returns:
        str: Raw HTML content of the page, so URLs and links can be extracted via parsing.
    """
    resp = requests.get(url, timeout=10)
    resp.raise_for_status()
    return resp.text


@tool
def extract_links(html: str) -> list[str]:
    """
    Parse HTML and return all unique href URLs found.

    Args:
        html (str): Raw HTML content.

    Returns:
        list[str]: Unique absolute or relative link URLs found in <a href="..."> tags
                   (deduplicated via a set, so order is not guaranteed).
    """
    soup = BeautifulSoup(html, "html.parser")
    return list({tag["href"] for tag in soup.find_all("a", href=True)})


@tool
def extract_text_from_pdf(pdf_path: str) -> str:
    """
    Extract all readable text from a PDF file.

    Args:
        pdf_path (str): Path to the PDF file (e.g. "input/paper.pdf").

    Returns:
        str: Complete extracted text from the PDF.
             Returns an empty string if extraction fails or file isn't found.
    """
    try:
        text = extract_text(pdf_path)
        return text or ""
    except Exception:
        return ""


@tool
def fetch_file(url: str, save_path: str) -> str:
    """
    Download a file from a URL and save it locally, creating directories if needed.

    Args:
        url (str): Direct link to the file (e.g., a PDF).
        save_path (str): Local file path where content will be saved
                         (e.g., "input/paper.pdf").

    Returns:
        str: The local save_path if download and save succeeded;
             empty string if an error occurred.
    """
    try:
        parent_dir = os.path.dirname(save_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        resp = requests.get(url, timeout=20)
        resp.raise_for_status()

        with open(save_path, "wb") as f:
            f.write(resp.content)

        return save_path
    except Exception:
        return ""


@tool
def normalize_place_name(place: str) -> str:
    """
    Convert abbreviated place names like 'St. Petersburg' to 'Saint Petersburg'.

    Args:
        place (str): City name, possibly abbreviated (e.g., "St. Petersburg").

    Returns:
        str: Fully written name (e.g., "Saint Petersburg").
    """
    if place.startswith("St. "):
        return "Saint " + place[4:]
    return place


@tool
def is_drink(item_name: str) -> bool:
    """
    Return True if the item is a known beverage, False otherwise.

    Args:
        item_name (str): Name of a menu item (e.g., "ice cream", "cola").

    Returns:
        bool: True if the item matches a small built-in list of drinks;
              False for anything else (treated as food).
    """
    drinks = ["soda", "cola", "coffee", "tea", "juice", "milkshake", "water"]
    return item_name.lower() in drinks


@tool
def download_youtube_comments(url: str, max_comments: int = 100) -> list[str]:
    """
    Download up to `max_comments` popular comments from a YouTube video.

    Args:
        url (str): Full URL of a YouTube video (e.g. "https://www.youtube.com/watch?v=abc123").
        max_comments (int): Maximum number of comments to retrieve (default is 100).

    Returns:
        list[str]: List of plain-text comment strings. Returns an empty list if the video
                   can't be accessed or no comments are found.
    """
    downloader = YoutubeCommentDownloader()
    comments = []
    for comment in downloader.get_comments_from_url(
        url, sort_by=SORT_BY_POPULAR
    ):
        comments.append(comment["text"])
        if len(comments) >= max_comments:
            break
    return comments
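

# Example (illustrative; the URL is a placeholder):
#     download_youtube_comments("https://www.youtube.com/watch?v=abc123", 50)
#     # -> up to 50 of the most popular comments as plain strings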


@tool
def get_youtube_transcript(video_url: str) -> list[str]:
    """
    Extract the transcript from a YouTube video as a list of text segments.

    Args:
        video_url (str): Full YouTube video URL (e.g. "https://www.youtube.com/watch?v=abc123xyz").

    Returns:
        list[str]: List of transcript lines. Empty list if transcript not available.
    """
    match = re.search(r"(?:v=|youtu\.be/)([a-zA-Z0-9_-]{11})", video_url)
    if not match:
        return []

    video_id = match.group(1)
    try:
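        # get_transcript is the classic static API of youtube-transcript-api
        # (pre-1.0 releases); newer versions expose an instance .fetch() method.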
        transcript = YouTubeTranscriptApi.get_transcript(video_id)
        return [
            entry["text"].strip().capitalize()
            for entry in transcript
            if entry["text"].strip()
        ]
    except Exception:
        return []
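

if __name__ == "__main__":
    # Minimal manual smoke test (illustrative only; network access is
    # required for the Wikipedia call, and all inputs are placeholders).
    print(list_wikipedia_sections("Python (programming language)")[:5])
    print(extract_page_numbers("See pages 3, 5 and 12 for details."))
    print(normalize_place_name("St. Petersburg"))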