# marcos-banik's picture
# 🚧 get_youtube_transcript
# commit 89698c0
from bs4 import BeautifulSoup
import requests
import re
from smolagents import tool
import torch
import spaces
from transformers import pipeline
from pdfminer.high_level import extract_text
import os
from youtube_comment_downloader import (
YoutubeCommentDownloader,
SORT_BY_POPULAR,
)
from youtube_transcript_api import YouTubeTranscriptApi
@tool
def list_wikipedia_sections(page_title: str) -> list[str]:
    """
    Return an ordered list of section headings from a Wikipedia article.
    Args:
        page_title (str): Title of the Wikipedia article,
            e.g., "Python (programming language)".
    Returns:
        list[str]: Section titles in the order they appear on the page,
            e.g., ["History", "Discography", "References", ...].
            Empty list if the article has no section headings.
    """
    # Wikipedia URLs use underscores in place of spaces.
    article_url = f"https://en.wikipedia.org/wiki/{page_title.replace(' ', '_')}"
    response = requests.get(article_url, timeout=10)
    response.raise_for_status()
    parsed = BeautifulSoup(response.text, "html.parser")

    headings: list[str] = []
    # Modern Wikipedia markup wraps each heading in <div class="mw-heading...">.
    for heading_div in parsed.find_all(
        "div", class_=lambda cls: cls and cls.startswith("mw-heading")
    ):
        if not (heading_div and heading_div.text):
            continue
        label = heading_div.get_text(strip=True)
        if not label:
            continue
        # Drop the trailing "[edit]" link text that Wikipedia appends.
        headings.append(re.sub(r"\[edit\]$", "", label).strip())
    return headings
@tool
def count_tables_in_wikipedia_section(
    page_title: str, section_title: str
) -> int:
    """
    Return the number of HTML <table> elements found within a specified section of a Wikipedia article.
    Args:
        page_title (str): Title of the Wikipedia article (spaces will be replaced with underscores),
            e.g., "Python (programming language)".
        section_title (str): Visible heading of the section to inspect, e.g., "Discography".
            Matched case-insensitively.
    Returns:
        int: The count of <table> tags under the given section heading, stopping at the next section.
            Returns 0 if the article or section is not found or if no tables are present.
    """
    url = f"https://en.wikipedia.org/wiki/{page_title.replace(' ', '_')}"
    resp = requests.get(url, timeout=10)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")

    # BUGFIX: use a separate variable for the matched heading. The original
    # reused the loop variable, so after an unsuccessful search it still held
    # the LAST heading and the "not found" case was never detected.
    heading = None
    for div in soup.find_all(
        "div", class_=lambda cls: cls and cls.startswith("mw-heading")
    ):
        title = re.sub(r"\[edit\]$", "", div.text.strip()).strip()
        if title and title.lower() == section_title.lower():
            heading = div
            break
    if heading is None:
        return 0  # section not present in this article

    count = 0
    for sibling in heading.find_next_siblings():
        # Stop as soon as the next section heading begins.
        if (
            sibling.name == "div"
            and sibling.get("class")
            and any(c.startswith("mw-heading") for c in sibling["class"])
        ):
            break
        if sibling.name == "table":
            count += 1
    return count
@tool
def extract_nth_table_in_wikipedia_section(
    page_title: str, section_title: str, n: int
) -> str:
    """
    Extract the nth table within a specified section of a Wikipedia article.
    Args:
        page_title (str):
            Title of the Wikipedia article (e.g., "Queen (band)" or "Python (programming language)").
            Spaces are automatically replaced with underscores.
        section_title (str):
            Visible title of the section to search (e.g., "Discography").
            Matched case-insensitively.
        n (int):
            1-based index specifying which table to extract (1 for the first table, 2 for second, etc.).
    Returns:
        str: The table serialized as tab-separated values: one line per <tr>,
            cell texts joined by tabs (literal tabs inside cells are replaced
            with spaces).
            Returns an empty string if:
            - The article cannot be found.
            - The section does not exist.
            - The section contains fewer than n tables.
    """
    url = f"https://en.wikipedia.org/wiki/{page_title.replace(' ', '_')}"
    resp = requests.get(url, timeout=10)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")

    # BUGFIX: track the matched heading separately; the original reused the
    # loop variable, so a failed search left it pointing at the last heading
    # and the section was silently mis-identified.
    heading = None
    for div in soup.find_all(
        "div", class_=lambda cls: cls and cls.startswith("mw-heading")
    ):
        title = re.sub(r"\[edit\]$", "", div.text.strip()).strip()
        if title and title.lower() == section_title.lower():
            heading = div
            break
    if heading is None:
        return ""

    # Walk siblings until the next section heading, counting tables.
    count = 0
    tbl = None
    for sib in heading.find_next_siblings():
        if (
            sib.name == "div"
            and sib.get("class")
            and any(c.startswith("mw-heading") for c in sib["class"])
        ):
            break
        if sib.name == "table":
            count += 1
            if count == n:
                tbl = sib
                break
    # BUGFIX: the original dereferenced tbl unconditionally and raised
    # AttributeError when the section held fewer than n tables.
    if tbl is None:
        return ""

    # Convert the table to TSV.
    rows = []
    for tr in tbl.find_all("tr"):
        cells = tr.find_all(["th", "td"])
        texts = [
            cell.get_text(separator=" ", strip=True).replace("\t", " ")
            for cell in cells
        ]
        if texts:
            rows.append("\t".join(texts))
    return "\n".join(rows)
@tool
def wikipedia_featured_articles_title(year: int) -> str:
    """
    Return the Wikipedia page title listing Featured Articles that were promoted in a given year.
    Args:
        year (int): The calendar year of interest (e.g., 2021, 2025).
    Note:
        This tool should be used in conjunction with list_wikipedia_sections
    Returns:
        str: The Wikipedia page title, e.g. for year = 2021:
            "Wikipedia:Featured articles promoted in 2021".
    """
    # These listing pages follow a fixed naming scheme, one per year.
    return "Wikipedia:Featured articles promoted in " + str(year)
# Prefer the GPU when one is visible; Whisper large-v3 is impractical on CPU.
device = (
    torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
)
# Module-level ASR pipeline shared by transcribe_audio_with_whisper.
# Built once at import time (downloads the model weights on first run).
asr_pipeline = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-large-v3",
    device=device,
    chunk_length_s=30,  # split long audio into 30-second chunks
    return_timestamps=False,
    generate_kwargs={"task": "transcribe"},  # transcribe, don't translate
)
@tool
@spaces.GPU
def transcribe_audio_with_whisper(filename: str) -> str:
    """
    Transcribe an audio file into text using Whisper.
    Args:
        filename (str): Path to the audio file on disk (e.g., "input/Strawberry pie.mp3").
    Returns:
        str: Transcribed text of the audio content.
    """
    # Feed raw bytes to the pipeline; it handles decoding/resampling itself.
    with open(filename, "rb") as audio_file:
        raw_audio = audio_file.read()
    result = asr_pipeline(raw_audio)
    return result["text"]
@tool
def extract_page_numbers(text: str) -> str:
    """
    Extract all page numbers referenced explicitly after the word 'page' or 'pages' in the text.
    Args:
        text (str): Input text that may mention "page 1", "pages 10, 20 and 30",
            or "pages 10, 20, and 30" (Oxford comma).
    Returns:
        str: A comma delimited list of unique page numbers in ascending order.
            Empty string if no page references are found.
    """
    # Grammar: "page(s) N", optionally followed by ", N" repetitions and a
    # final "and N" / ", and N". The original pattern lost the last number
    # when an Oxford comma preceded "and"; "\s*,?\s+and" accepts both forms.
    matches = re.findall(
        r"\bpages?\s+([0-9]+(?:\s*,\s*[0-9]+)*(?:\s*,?\s+and\s+[0-9]+)?)",
        text,
        flags=re.IGNORECASE,
    )
    pages = set()
    for match in matches:
        # Split each captured run on commas and/or the word "and".
        for num in re.split(r"(?:,|\band\b)", match):
            num = num.strip()
            if num.isdigit():
                pages.add(int(num))
    if not pages:
        return ""
    return ", ".join(str(p) for p in sorted(pages))
@tool
def fetch_raw_html(url: str) -> str:
    """
    Return the raw HTML content of the given URL.
    Args:
        url (str): A fully qualified HTTP/HTTPS URL.
    Returns:
        str: Raw HTML content of the page, so URLs and links can be extracted via parsing.
    """
    response = requests.get(url, timeout=10)
    # Surface HTTP errors (4xx/5xx) to the caller rather than returning an error page.
    response.raise_for_status()
    return response.text
@tool
def extract_links(html: str) -> list[str]:
    """
    Parse HTML and return all unique href URLs found.
    Args:
        html (str): Raw HTML content.
    Returns:
        list[str]: Unique absolute or relative link URLs found in <a href="..."> tags,
            in first-seen document order.
    """
    soup = BeautifulSoup(html, "html.parser")
    # dict.fromkeys dedupes while preserving first-seen order; the original
    # set comprehension returned links in arbitrary, run-to-run order.
    hrefs = (tag["href"] for tag in soup.find_all("a", href=True))
    return list(dict.fromkeys(hrefs))
@tool
def extract_text_from_pdf(pdf_path: str) -> str:
    """
    Extract all readable text from a PDF file.
    Args:
        pdf_path (str): Path to the PDF file (e.g. "input/paper.pdf").
    Returns:
        str: Complete extracted text from the PDF.
            Returns an empty string if extraction fails or file isn't found.
    """
    try:
        # pdfminer may return None for image-only PDFs; coerce to "".
        return extract_text(pdf_path) or ""
    except Exception:
        # Best-effort by contract: any parse/IO failure yields "".
        return ""
@tool
def fetch_file(url: str, save_path: str) -> str:
    """
    Download a file from a URL and save it locally, creating directories if needed.
    Args:
        url (str): Direct link to the file (e.g., a PDF).
        save_path (str): Local file path where content will be saved
            (e.g., "input/paper.pdf").
    Returns:
        str: The local save_path if download and save succeeded;
            empty string if an error occurred.
    """
    try:
        directory = os.path.dirname(save_path)
        if directory:
            # Create intermediate directories; no-op when they already exist.
            os.makedirs(directory, exist_ok=True)
        response = requests.get(url, timeout=20)
        response.raise_for_status()
        with open(save_path, "wb") as output_file:
            output_file.write(response.content)
    except Exception:
        # Best-effort by contract: any network/filesystem failure yields "".
        return ""
    return save_path
@tool
def normalize_place_name(place: str) -> str:
    """
    Convert abbreviated place names like 'St. Petersburg' to 'Saint Petersburg'.
    Args:
        place (str): City name, possibly abbreviated (e.g., "St. Petersburg").
    Returns:
        str: Fully written name (e.g., "Saint Petersburg").
    """
    abbreviation = "St. "
    if not place.startswith(abbreviation):
        # Already in full form (or uses some other abbreviation): leave as-is.
        return place
    return "Saint " + place[len(abbreviation):]
@tool
def is_drink(item_name: str) -> bool:
    """
    Return True if the item is a beverage or drink, False otherwise.
    Args:
        item_name (str): Name of a menu item (e.g., "ice cream", "cola").
            Matching is case-insensitive and ignores surrounding whitespace.
    Returns:
        bool: True if it's a drink, False if it's food.
    """
    # Set literal gives O(1) membership (the original used a list);
    # strip() makes inputs like " cola " match as expected.
    drinks = {"soda", "cola", "coffee", "tea", "juice", "milkshake", "water"}
    return item_name.strip().lower() in drinks
@tool
def download_youtube_comments(url: str, max_comments: int = 100) -> list[str]:
    """
    Download up to `max_comments` popular comments from a YouTube video.
    Args:
        url (str): Full URL of a YouTube video (e.g. "https://www.youtube.com/watch?v=abc123").
        max_comments (int): Maximum number of comments to retrieve (default is 100).
    Returns:
        list[str]: List of plain-text comment strings. Returns an empty list if the video
            can't be accessed or no comments are found.
    """
    downloader = YoutubeCommentDownloader()
    # The downloader yields comments lazily; stop as soon as we have enough.
    collected: list[str] = []
    comment_stream = downloader.get_comments_from_url(url, sort_by=SORT_BY_POPULAR)
    for entry in comment_stream:
        collected.append(entry["text"])
        if len(collected) >= max_comments:
            break
    return collected
@tool
def get_youtube_transcript(video_url: str) -> list[str]:
    """
    Extracts the transcript from a YouTube video as a list of text segments.
    Args:
        video_url (str): Full YouTube video URL (e.g. "https://www.youtube.com/watch?v=abc123xyz").
    Returns:
        list[str]: List of transcript lines (each stripped and capitalized).
            Empty list if transcript not available.
    """
    # Accept both "watch?v=<id>" and "youtu.be/<id>" forms; IDs are 11 chars.
    id_match = re.search(r"(?:v=|youtu\.be/)([a-zA-Z0-9_-]{11})", video_url)
    if id_match is None:
        return []
    try:
        segments = YouTubeTranscriptApi.get_transcript(id_match.group(1))
        lines = []
        for segment in segments:
            cleaned = segment["text"].strip()
            if cleaned:
                lines.append(cleaned.capitalize())
        return lines
    except Exception:
        # Transcript disabled, video private/unavailable, etc.
        return []