Spaces:

fmarky
/

youtube-video-summary

Sleeping

App Files Files Community

youtube-video-summary / utils /youtube_transcript.py

fmarky

refactor: move libs to utils

d419635 7 months ago

raw

history blame contribute delete

4.11 kB

	from __future__ import annotations

	import requests
	from bs4 import BeautifulSoup
	from langchain_core.tools import tool
	from youtube_transcript_api import YouTubeTranscriptApi
	from youtube_transcript_api._errors import (
	NoTranscriptFound,
	TranscriptsDisabled,
	VideoUnavailable,
	)

	# ---- Config -----------------------------------------------------------------

	# Passed as the ``timeout`` argument of ``requests.get`` by ``_fetch_html``.
	DEFAULT_TIMEOUT = 30

	# Headers sent with every page download. A desktop-browser User-Agent helps
	# avoid consent/anti-bot interstitials on some sites.
	DEFAULT_HEADERS = {
	    "User-Agent": (
	        "Mozilla/5.0 (X11; Linux x86_64) "
	        "AppleWebKit/537.36 (KHTML, like Gecko) "
	        "Chrome/120.0.0.0 Safari/537.36"
	    ),
	}


	# ---- Small helpers ----------------------------------------------------------


	def _fetch_html(url: str, *, timeout: int = DEFAULT_TIMEOUT) -> str:
	    """Fetch ``url`` with the module's default headers and return the body text.

	    Args:
	        url: Address of the page to download.
	        timeout: Forwarded to ``requests.get`` as its ``timeout`` argument.

	    Returns:
	        The response body decoded as text (``Response.text``).

	    Raises:
	        Whatever ``Response.raise_for_status`` raises for an error status.
	    """
	    response = requests.get(url, timeout=timeout, headers=DEFAULT_HEADERS)
	    # Fail loudly on an error status instead of handing back an error page.
	    response.raise_for_status()
	    return response.text


	def parse_title_description(raw: str) -> tuple[str, str]:
	    """Split a ``"Title: ...\\nDescription: ..."`` string into its two parts.

	    This is the inverse of the text built by ``get_youtube_title_description``.

	    Args:
	        raw: Text containing a ``Description:`` label, optionally preceded by
	            a ``Title:`` label.

	    Returns:
	        A ``(title, description)`` tuple with surrounding whitespace removed.

	    Raises:
	        ValueError: If ``raw`` contains no ``Description:`` label.
	    """
	    # Prefer the label at the start of a line: that is where the formatter
	    # puts it, and it keeps a title that itself contains "Description:" intact.
	    head, label, tail = raw.partition("\nDescription:")
	    if not label:
	        # Fall back to the first bare label for single-line input.
	        head, label, tail = raw.partition("Description:")
	    if not label:
	        raise ValueError("Expected a 'Description:' label in the input text")

	    title = head.strip()
	    # Drop only the leading label; a "Title:" inside the title is real content.
	    if title.startswith("Title:"):
	        title = title[len("Title:"):]
	    return title.strip(), tail.strip()


	# ---- Plain functions (safe to call directly in Python) ----------------------


	def get_youtube_transcript(video_id: str) -> str:
	    """Fetch a video's transcript and return it as newline-separated text.

	    Args:
	        video_id: The YouTube video ID, i.e. the value of the ``v`` query
	            parameter. For https://www.youtube.com/watch?v=1htKBjuUWec the
	            ID is ``1htKBjuUWec``.

	    Returns:
	        The ``text`` field of every transcript entry, one entry per line.

	    Raises:
	        RuntimeError: If transcripts are disabled, no transcript is found, or
	            the video is unavailable. The underlying error is chained.
	    """
	    try:
	        raw_entries = YouTubeTranscriptApi().fetch(video_id).to_raw_data()
	    except (TranscriptsDisabled, NoTranscriptFound, VideoUnavailable) as err:
	        raise RuntimeError(f"Transcript unavailable: {err}") from err

	    # Each raw entry is a dict such as
	    # {"text": "Hey there", "start": 0.0, "duration": 1.54}; only the text
	    # is kept.
	    return "\n".join(entry["text"] for entry in raw_entries)


	def get_youtube_title_description(video_url: str) -> str:
	    """Return the title and description of a YouTube video page.

	    The page at ``video_url`` (for example
	    https://www.youtube.com/watch?v=1htKBjuUWec) is downloaded and its
	    ``<meta>`` tags are inspected. Open Graph tags (``og:title`` /
	    ``og:description``) are preferred; the plain ``name="title"`` /
	    ``name="description"`` tags are the fallback.

	    Args:
	        video_url: Full URL of the video page.

	    Returns:
	        Two lines of text: ``Title: <title>`` followed by
	        ``Description: <description>``. A missing or empty value is replaced
	        with ``No title found`` / ``No description found``.
	    """
	    page = BeautifulSoup(_fetch_html(video_url), "html.parser")

	    def _meta_content(og_property: str, plain_name: str, fallback: str) -> str:
	        """Read a meta tag's content, trying the Open Graph tag first."""
	        tag = page.find("meta", property=og_property) or page.find(
	            "meta", attrs={"name": plain_name}
	        )
	        content = tag.get("content") if tag else None
	        # An absent tag and an empty content attribute both use the fallback.
	        return content or fallback

	    title = _meta_content("og:title", "title", "No title found")
	    description = _meta_content(
	        "og:description", "description", "No description found"
	    )
	    return f"Title: {title}\nDescription: {description}"


	# ---- LangChain tool wrappers (for agents; call with .invoke) ----------------


	# Agent-facing wrapper registered under the tool name "get_youtube_transcript".
	# NOTE(review): the @tool decorator presumably exposes the docstring below as
	# the tool description shown to the model - confirm before rewording it.
	@tool("get_youtube_transcript")
	def get_youtube_transcript_tool(video_id: str) -> str:
	    """Tool: return YouTube transcript text for a video ID."""
	    # Delegate to the plain function so both entry points share one code path.
	    transcript_text = get_youtube_transcript(video_id)
	    return transcript_text


	# Agent-facing wrapper registered under the tool name
	# "get_youtube_title_description".
	# NOTE(review): the @tool decorator presumably exposes the docstring below as
	# the tool description shown to the model - confirm before rewording it.
	@tool("get_youtube_title_description")
	def get_youtube_title_description_tool(video_url: str) -> str:
	    """Tool: return YouTube title + description for a video URL."""
	    # Delegate to the plain function so both entry points share one code path.
	    title_and_description = get_youtube_title_description(video_url)
	    return title_and_description


	# ---- Minimal demo -----------------------------------------------------------

	if __name__ == "__main__":
	    # Manual smoke test: print the metadata and a transcript preview for one
	    # sample video.
	    demo_id = "1htKBjuUWec"
	    demo_url = f"https://www.youtube.com/watch?v={demo_id}"

	    print(get_youtube_title_description(demo_url))
	    try:
	        print("\n--- Transcript (first 500 chars) ---")
	        transcript = get_youtube_transcript(demo_id)
	        preview = transcript[:500]
	        if len(transcript) > 500:
	            # Mark that the printed text was cut short.
	            preview += "..."
	        print(preview)
	    except Exception as exc:
	        # Top-level demo boundary: report the problem instead of a traceback.
	        print(f"Transcript error: {exc}")