# youtube-video-summary / utils / youtube_transcript.py
# Page-header residue from the repository file view:
#   author avatar: fmarky
#   commit: "refactor: move libs to utils" (d419635)
from __future__ import annotations
import requests
from bs4 import BeautifulSoup
from langchain_core.tools import tool
from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api._errors import (
NoTranscriptFound,
TranscriptsDisabled,
VideoUnavailable,
)
# ---- Config -----------------------------------------------------------------
# Timeout passed to `requests.get` by `_fetch_html` when the caller gives none.
DEFAULT_TIMEOUT = 30

# Headers sent with every page download. The browser-like User-Agent helps
# avoid consent/anti-bot interstitials on some sites.
DEFAULT_HEADERS = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
}
# ---- Small helpers ----------------------------------------------------------
def _fetch_html(url: str, *, timeout: int = DEFAULT_TIMEOUT) -> str:
    """Fetch *url* with the module's default headers and return the body text.

    Args:
        url: Address of the page to download.
        timeout: Value forwarded to ``requests.get`` as its ``timeout``.

    Returns:
        The response body decoded as text (``Response.text``).

    Raises:
        Whatever ``requests.get`` raises for transport failures, plus the
        error raised by ``Response.raise_for_status()`` for HTTP error codes.
    """
    response = requests.get(url, timeout=timeout, headers=DEFAULT_HEADERS)
    # Fail loudly on HTTP error statuses instead of returning an error page.
    response.raise_for_status()
    page_text = response.text
    return page_text
def parse_title_description(raw: str) -> tuple[str, str]:
    """Split a "Title: ... / Description: ..." string into its two parts.

    The expected input is the text produced by
    ``get_youtube_title_description``: a ``Title:`` label followed by the
    title, then a ``Description:`` label followed by the description.

    Args:
        raw: Combined title/description text.

    Returns:
        A ``(title, description)`` tuple with surrounding whitespace removed.

    Raises:
        ValueError: If ``raw`` contains no ``Description:`` marker.
    """
    # Split on the FIRST marker only, so a description that itself contains
    # the word "Description:" is kept intact.
    head, marker, body = raw.partition("Description:")
    if not marker:
        raise ValueError("Expected a 'Description:' section in the input text")

    title = head.strip()
    # Drop only the leading label. A blanket replace() would also delete any
    # "Title:" that is part of the real title text.
    if title.startswith("Title:"):
        title = title[len("Title:"):].strip()
    return title, body.strip()
# ---- Pure functions (safe to call directly in Python) -----------------------
def get_youtube_transcript(video_id: str) -> str:
    """
    Fetch the transcript of a YouTube video and return it as plain text.

    Every transcript chunk becomes one line of the returned string.

    Example of video_id:
        For the video https://www.youtube.com/watch?v=1htKBjuUWec
        the video id is: 1htKBjuUWec

    Raises:
        RuntimeError: if the API reports that transcripts are disabled, that
            no transcript was found, or that the video is unavailable.
    """
    try:
        fetched = YouTubeTranscriptApi().fetch(video_id)
        # The raw data is a list of dicts such as
        #   [{'text': 'Hey there', 'start': 0.0, 'duration': 1.54},
        #    {'text': 'how are you', 'start': 1.54, 'duration': 4.16}, ...]
        entries = fetched.to_raw_data()
    except (TranscriptsDisabled, NoTranscriptFound, VideoUnavailable) as exc:
        raise RuntimeError(f"Transcript unavailable: {exc}") from exc

    # Only the 'text' field is kept; timing information is discarded.
    return "\n".join(entry["text"] for entry in entries)
def get_youtube_title_description(video_url: str) -> str:
    """
    Return the title and description of a YouTube page as one string.

    The page at ``video_url`` (ex: https://www.youtube.com/watch?v=1htKBjuUWec)
    is downloaded and its Open Graph meta tags (``og:title`` /
    ``og:description``) are read. When an Open Graph tag is missing, the
    plain ``<meta name="title">`` / ``<meta name="description">`` tag is used
    instead. If neither yields content, a "No ... found" placeholder is used.

    Returns:
        Text of the form "Title: <title>", a newline, then
        "Description: <description>".
    """
    page = BeautifulSoup(_fetch_html(video_url), "html.parser")

    def _meta_content(og_property, meta_name, fallback):
        # Prefer the Open Graph tag; fall back to the plain named meta tag.
        tag = page.find("meta", property=og_property)
        if not tag:
            tag = page.find("meta", attrs={"name": meta_name})
        value = tag.get("content") if tag else None
        # A missing tag and an empty/absent content attribute both end up here.
        return value or fallback

    title = _meta_content("og:title", "title", "No title found")
    description = _meta_content(
        "og:description", "description", "No description found"
    )
    return f"Title: {title}\nDescription: {description}"
# ---- LangChain tool wrappers (for agents; call with .invoke) ----------------
@tool("get_youtube_transcript")
def get_youtube_transcript_tool(video_id: str) -> str:
    """Tool: return YouTube transcript text for a video ID."""
    # NOTE: the docstring above is kept verbatim; the @tool decorator is
    # documented to use it as the tool description shown to the agent.
    # Thin wrapper: all the work happens in get_youtube_transcript.
    transcript_text = get_youtube_transcript(video_id)
    return transcript_text
@tool("get_youtube_title_description")
def get_youtube_title_description_tool(video_url: str) -> str:
    """Tool: return YouTube title + description for a video URL."""
    # NOTE: the docstring above is kept verbatim; the @tool decorator is
    # documented to use it as the tool description shown to the agent.
    # Thin wrapper: all the work happens in get_youtube_title_description.
    title_and_description = get_youtube_title_description(video_url)
    return title_and_description
# ---- Minimal demo -----------------------------------------------------------
if __name__ == "__main__":
    # Manual smoke test: print the metadata, then a short transcript preview.
    video_id = "1htKBjuUWec"
    print(get_youtube_title_description(f"https://www.youtube.com/watch?v={video_id}"))

    try:
        print("\n--- Transcript (first 500 chars) ---")
        full_text = get_youtube_transcript(video_id)
        preview = full_text[:500]
        # Mark the output as truncated only when something was cut off.
        if len(full_text) > 500:
            preview += "..."
        print(preview)
    except Exception as err:
        # Demo boundary: report the problem instead of showing a traceback.
        print(f"Transcript error: {err}")