# marcos-banik's picture
# 🚧 get_youtube_transcript
# commit 89698c0
from bs4 import BeautifulSoup
import requests
import re
from smolagents import tool
import torch
import spaces
from transformers import pipeline
from pdfminer.high_level import extract_text
import os
from youtube_comment_downloader import (
YoutubeCommentDownloader,
SORT_BY_POPULAR,
)
from youtube_transcript_api import YouTubeTranscriptApi
@tool
def list_wikipedia_sections(page_title: str) -> list[str]:
    """
    Return an ordered list of section headings from a Wikipedia article.
    Args:
        page_title (str): Title of the Wikipedia article,
            e.g., "Python (programming language)".
    Returns:
        list[str]: Section titles in the order they appear on the page,
            e.g., ["History", "Discography", "References", ...].
            Empty list if the article has no section headings.
    """
    # Wikipedia URLs use underscores in place of spaces.
    article_url = f"https://en.wikipedia.org/wiki/{page_title.replace(' ', '_')}"
    response = requests.get(article_url, timeout=10)
    response.raise_for_status()
    parsed = BeautifulSoup(response.text, "html.parser")

    headings: list[str] = []
    # Modern Wikipedia markup wraps each heading in <div class="mw-heading...">.
    for heading_div in parsed.find_all(
        "div", class_=lambda cls: cls and cls.startswith("mw-heading")
    ):
        if not (heading_div and heading_div.text):
            continue
        label = heading_div.get_text(strip=True)
        if not label:
            continue
        # Drop the trailing "[edit]" link text that Wikipedia appends.
        headings.append(re.sub(r"\[edit\]$", "", label).strip())
    return headings
@tool
def count_tables_in_wikipedia_section(
    page_title: str, section_title: str
) -> int:
    """
    Return the number of HTML <table> elements found within a specified section of a Wikipedia article.
    Args:
        page_title (str): Title of the Wikipedia article (spaces will be replaced with underscores),
            e.g., "Python (programming language)".
        section_title (str): Visible heading of the section to inspect, e.g., "Discography".
            Matched case-insensitively.
    Returns:
        int: The count of <table> tags under the given section heading, stopping at the next section.
            Returns 0 if the article or section is not found or if no tables are present.
    """
    url = f"https://en.wikipedia.org/wiki/{page_title.replace(' ', '_')}"
    resp = requests.get(url, timeout=10)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")

    # BUGFIX: use a separate variable for the matched heading. The original
    # reused the loop variable, so after an unsuccessful search it still held
    # the LAST heading and the "not found" case was never detected.
    heading = None
    for div in soup.find_all(
        "div", class_=lambda cls: cls and cls.startswith("mw-heading")
    ):
        title = re.sub(r"\[edit\]$", "", div.text.strip()).strip()
        if title and title.lower() == section_title.lower():
            heading = div
            break
    if heading is None:
        return 0  # section not present in this article

    count = 0
    for sibling in heading.find_next_siblings():
        # Stop as soon as the next section heading begins.
        if (
            sibling.name == "div"
            and sibling.get("class")
            and any(c.startswith("mw-heading") for c in sibling["class"])
        ):
            break
        if sibling.name == "table":
            count += 1
    return count
@tool
def extract_nth_table_in_wikipedia_section(
    page_title: str, section_title: str, n: int
) -> str:
    """
    Extract the nth table within a specified section of a Wikipedia article.
    Args:
        page_title (str):
            Title of the Wikipedia article (e.g., "Queen (band)" or "Python (programming language)").
            Spaces are automatically replaced with underscores.
        section_title (str):
            Visible title of the section to search (e.g., "Discography").
            Matched case-insensitively.
        n (int):
            1-based index specifying which table to extract (1 for the first table, 2 for second, etc.).
    Returns:
        str: The table serialized as tab-separated values: one line per <tr>,
            cell texts joined by tabs (literal tabs inside cells are replaced
            with spaces).
            Returns an empty string if:
            - The article cannot be found.
            - The section does not exist.
            - The section contains fewer than n tables.
    """
    url = f"https://en.wikipedia.org/wiki/{page_title.replace(' ', '_')}"
    resp = requests.get(url, timeout=10)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")

    # BUGFIX: track the matched heading separately; the original reused the
    # loop variable, so a failed search left it pointing at the last heading
    # and the section was silently mis-identified.
    heading = None
    for div in soup.find_all(
        "div", class_=lambda cls: cls and cls.startswith("mw-heading")
    ):
        title = re.sub(r"\[edit\]$", "", div.text.strip()).strip()
        if title and title.lower() == section_title.lower():
            heading = div
            break
    if heading is None:
        return ""

    # Walk siblings until the next section heading, counting tables.
    count = 0
    tbl = None
    for sib in heading.find_next_siblings():
        if (
            sib.name == "div"
            and sib.get("class")
            and any(c.startswith("mw-heading") for c in sib["class"])
        ):
            break
        if sib.name == "table":
            count += 1
            if count == n:
                tbl = sib
                break
    # BUGFIX: the original dereferenced tbl unconditionally and raised
    # AttributeError when the section held fewer than n tables.
    if tbl is None:
        return ""

    # Convert the table to TSV.
    rows = []
    for tr in tbl.find_all("tr"):
        cells = tr.find_all(["th", "td"])
        texts = [
            cell.get_text(separator=" ", strip=True).replace("\t", " ")
            for cell in cells
        ]
        if texts:
            rows.append("\t".join(texts))
    return "\n".join(rows)
@tool
def wikipedia_featured_articles_title(year: int) -> str:
    """
    Return the Wikipedia page title listing Featured Articles that were promoted in a given year.
    Args:
        year (int): The calendar year of interest (e.g., 2021, 2025).
    Note:
        This tool should be used in conjunction with list_wikipedia_sections
    Returns:
        str: The Wikipedia page title, e.g. for year = 2021:
            "Wikipedia:Featured articles promoted in 2021".
    """
    # These listing pages follow a fixed naming scheme, one per year.
    return "Wikipedia:Featured articles promoted in " + str(year)
# Prefer the GPU when one is visible; Whisper large-v3 is impractical on CPU.
device = (
    torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
)
# Module-level ASR pipeline shared by transcribe_audio_with_whisper.
# Built once at import time (downloads the model weights on first run).
asr_pipeline = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-large-v3",
    device=device,
    chunk_length_s=30,  # split long audio into 30-second chunks
    return_timestamps=False,
    generate_kwargs={"task": "transcribe"},  # transcribe, don't translate
)
@tool
@spaces.GPU
def transcribe_audio_with_whisper(filename: str) -> str:
    """
    Transcribe an audio file into text using Whisper.
    Args:
        filename (str): Path to the audio file on disk (e.g., "input/Strawberry pie.mp3").
    Returns:
        str: Transcribed text of the audio content.
    """
    # Feed raw bytes to the pipeline; it handles decoding/resampling itself.
    with open(filename, "rb") as audio_file:
        raw_audio = audio_file.read()
    result = asr_pipeline(raw_audio)
    return result["text"]
@tool
def extract_page_numbers(text: str) -> str:
    """
    Extract all page numbers referenced explicitly after the word 'page' or 'pages' in the text.
    Args:
        text (str): Input text that may mention "page 1", "pages 10, 20 and 30",
            or "pages 10, 20, and 30" (Oxford comma).
    Returns:
        str: A comma delimited list of unique page numbers in ascending order.
            Empty string if no page references are found.
    """
    # Grammar: "page(s) N", optionally followed by ", N" repetitions and a
    # final "and N" / ", and N". The original pattern lost the last number
    # when an Oxford comma preceded "and"; "\s*,?\s+and" accepts both forms.
    matches = re.findall(
        r"\bpages?\s+([0-9]+(?:\s*,\s*[0-9]+)*(?:\s*,?\s+and\s+[0-9]+)?)",
        text,
        flags=re.IGNORECASE,
    )
    pages = set()
    for match in matches:
        # Split each captured run on commas and/or the word "and".
        for num in re.split(r"(?:,|\band\b)", match):
            num = num.strip()
            if num.isdigit():
                pages.add(int(num))
    if not pages:
        return ""
    return ", ".join(str(p) for p in sorted(pages))
@tool
def fetch_raw_html(url: str) -> str:
    """
    Return the raw HTML content of the given URL.
    Args:
        url (str): A fully qualified HTTP/HTTPS URL.
    Returns:
        str: Raw HTML content of the page, so URLs and links can be extracted via parsing.
    """
    response = requests.get(url, timeout=10)
    # Surface HTTP errors (4xx/5xx) to the caller rather than returning an error page.
    response.raise_for_status()
    return response.text
@tool
def extract_links(html: str) -> list[str]:
    """
    Parse HTML and return all unique href URLs found.
    Args:
        html (str): Raw HTML content.
    Returns:
        list[str]: Unique absolute or relative link URLs found in <a href="..."> tags,
            in first-seen document order.
    """
    soup = BeautifulSoup(html, "html.parser")
    # dict.fromkeys dedupes while preserving first-seen order; the original
    # set comprehension returned links in arbitrary, run-to-run order.
    hrefs = (tag["href"] for tag in soup.find_all("a", href=True))
    return list(dict.fromkeys(hrefs))
@tool
def extract_text_from_pdf(pdf_path: str) -> str:
    """
    Extract all readable text from a PDF file.
    Args:
        pdf_path (str): Path to the PDF file (e.g. "input/paper.pdf").
    Returns:
        str: Complete extracted text from the PDF.
            Returns an empty string if extraction fails or file isn't found.
    """
    try:
        # pdfminer may return None for image-only PDFs; coerce to "".
        return extract_text(pdf_path) or ""
    except Exception:
        # Best-effort by contract: any parse/IO failure yields "".
        return ""
@tool
def fetch_file(url: str, save_path: str) -> str:
    """
    Download a file from a URL and save it locally, creating directories if needed.
    Args:
        url (str): Direct link to the file (e.g., a PDF).
        save_path (str): Local file path where content will be saved
            (e.g., "input/paper.pdf").
    Returns:
        str: The local save_path if download and save succeeded;
            empty string if an error occurred.
    """
    try:
        directory = os.path.dirname(save_path)
        if directory:
            # Create intermediate directories; no-op when they already exist.
            os.makedirs(directory, exist_ok=True)
        response = requests.get(url, timeout=20)
        response.raise_for_status()
        with open(save_path, "wb") as output_file:
            output_file.write(response.content)
    except Exception:
        # Best-effort by contract: any network/filesystem failure yields "".
        return ""
    return save_path
@tool
def normalize_place_name(place: str) -> str:
    """
    Convert abbreviated place names like 'St. Petersburg' to 'Saint Petersburg'.
    Args:
        place (str): City name, possibly abbreviated (e.g., "St. Petersburg").
    Returns:
        str: Fully written name (e.g., "Saint Petersburg").
    """
    abbreviation = "St. "
    if not place.startswith(abbreviation):
        # Already in full form (or uses some other abbreviation): leave as-is.
        return place
    return "Saint " + place[len(abbreviation):]
@tool
def is_drink(item_name: str) -> bool:
    """
    Return True if the item is a beverage or drink, False otherwise.
    Args:
        item_name (str): Name of a menu item (e.g., "ice cream", "cola").
            Matching is case-insensitive and ignores surrounding whitespace.
    Returns:
        bool: True if it's a drink, False if it's food.
    """
    # Set literal gives O(1) membership (the original used a list);
    # strip() makes inputs like " cola " match as expected.
    drinks = {"soda", "cola", "coffee", "tea", "juice", "milkshake", "water"}
    return item_name.strip().lower() in drinks
@tool
def download_youtube_comments(url: str, max_comments: int = 100) -> list[str]:
    """
    Download up to `max_comments` popular comments from a YouTube video.
    Args:
        url (str): Full URL of a YouTube video (e.g. "https://www.youtube.com/watch?v=abc123").
        max_comments (int): Maximum number of comments to retrieve (default is 100).
    Returns:
        list[str]: List of plain-text comment strings. Returns an empty list if the video
            can't be accessed or no comments are found.
    """
    downloader = YoutubeCommentDownloader()
    # The downloader yields comments lazily; stop as soon as we have enough.
    collected: list[str] = []
    comment_stream = downloader.get_comments_from_url(url, sort_by=SORT_BY_POPULAR)
    for entry in comment_stream:
        collected.append(entry["text"])
        if len(collected) >= max_comments:
            break
    return collected
@tool
def get_youtube_transcript(video_url: str) -> list[str]:
    """
    Extracts the transcript from a YouTube video as a list of text segments.
    Args:
        video_url (str): Full YouTube video URL (e.g. "https://www.youtube.com/watch?v=abc123xyz").
    Returns:
        list[str]: List of transcript lines (each stripped and capitalized).
            Empty list if transcript not available.
    """
    # Accept both "watch?v=<id>" and "youtu.be/<id>" forms; IDs are 11 chars.
    id_match = re.search(r"(?:v=|youtu\.be/)([a-zA-Z0-9_-]{11})", video_url)
    if id_match is None:
        return []
    try:
        segments = YouTubeTranscriptApi.get_transcript(id_match.group(1))
        lines = []
        for segment in segments:
            cleaned = segment["text"].strip()
            if cleaned:
                lines.append(cleaned.capitalize())
        return lines
    except Exception:
        # Transcript disabled, video private/unavailable, etc.
        return []