NotebookLM / ingestion_engine /transcripter.py
internomega-terrablue
you tube api fix
3c3a632
raw
history blame contribute delete
857 Bytes
"""YouTube video transcript extraction."""
from urllib.parse import urlparse, parse_qs
from youtube_transcript_api import YouTubeTranscriptApi
def extract(url: str) -> str:
    """Return the transcript of the YouTube video at *url* as one string.

    The video ID is obtained from ``_parse_video_id``; the transcript is then
    fetched through ``YouTubeTranscriptApi`` and the ``text`` of every snippet
    is joined with single spaces.

    Args:
        url: Address of the YouTube video.

    Returns:
        The snippet texts concatenated in the order the API yields them.

    Raises:
        ValueError: If no video ID could be parsed from *url*.

    Note:
        No exception handling surrounds the fetch, so any error raised by
        ``YouTubeTranscriptApi`` propagates to the caller unchanged.
    """
    if not (vid := _parse_video_id(url)):
        # Covers both ``None`` and an empty ID string.
        raise ValueError(f"Could not parse YouTube video ID from: {url}")

    snippets = YouTubeTranscriptApi().fetch(vid)
    pieces = [entry.text for entry in snippets]
    return " ".join(pieces)
def _parse_video_id(url: str) -> str | None:
"""Extract video ID from youtube.com/watch?v=... or youtu.be/... URLs."""
parsed = urlparse(url)
hostname = parsed.hostname or ""
if "youtu.be" in hostname:
return parsed.path.lstrip("/")
if "youtube.com" in hostname:
return parse_qs(parsed.query).get("v", [None])[0]
return None