| | """
|
| | fetcher.py
|
| | Fetches YouTube transcripts directly via the caption API — no HTML parsing.
|
| | Author: algorembrant
|
| | """
|
| |
|
| | from __future__ import annotations
|
| |
|
| | import re
|
| | import sys
|
| | from typing import Optional
|
| |
|
| | from youtube_transcript_api import YouTubeTranscriptApi
|
| | from youtube_transcript_api.formatters import (
|
| | JSONFormatter,
|
| | SRTFormatter,
|
| | TextFormatter,
|
| | WebVTTFormatter,
|
| | )
|
| | from youtube_transcript_api._errors import (
|
| | NoTranscriptAvailable,
|
| | NoTranscriptFound,
|
| | TranscriptsDisabled,
|
| | VideoUnavailable,
|
| | )
|
| |
|
| | from config import DEFAULT_LANGUAGES
|
| |
|
| |
|
| |
|
| |
|
| |
|
| |
|
| | _ID_PATTERNS = [
|
| | r"(?:youtube\.com/watch\?.*v=)([a-zA-Z0-9_-]{11})",
|
| | r"(?:youtu\.be/)([a-zA-Z0-9_-]{11})",
|
| | r"(?:youtube\.com/shorts/)([a-zA-Z0-9_-]{11})",
|
| | r"(?:youtube\.com/embed/)([a-zA-Z0-9_-]{11})",
|
| | ]
|
| |
|
| |
|
| | def extract_video_id(url_or_id: str) -> str:
|
| | """Return the 11-character YouTube video ID from a URL or raw ID."""
|
| | for pattern in _ID_PATTERNS:
|
| | match = re.search(pattern, url_or_id)
|
| | if match:
|
| | return match.group(1)
|
| |
|
| | if re.fullmatch(r"[a-zA-Z0-9_-]{11}", url_or_id):
|
| | return url_or_id
|
| |
|
| | raise ValueError(
|
| | f"Cannot extract a valid YouTube video ID from: {url_or_id!r}\n"
|
| | "Accepted: full YouTube URL, youtu.be link, Shorts URL, embed URL, or raw 11-char ID."
|
| | )
|
| |
|
| |
|
| |
|
| |
|
| |
|
| |
|
| | def list_available_transcripts(video_id: str) -> None:
|
| | """Print all available transcript languages for a video."""
|
| | tlist = YouTubeTranscriptApi.list_transcripts(video_id)
|
| |
|
| | manual = list(tlist._manually_created_transcripts.values())
|
| | auto = list(tlist._generated_transcripts.values())
|
| |
|
| | print(f"\nAvailable transcripts -- video: {video_id}\n")
|
| | if manual:
|
| | print("Manually created:")
|
| | for t in manual:
|
| | print(f" [{t.language_code:8s}] {t.language}")
|
| | if auto:
|
| | print("Auto-generated:")
|
| | for t in auto:
|
| | print(f" [{t.language_code:8s}] {t.language}")
|
| | if not manual and not auto:
|
| | print(" (none found)")
|
| |
|
| |
|
| |
|
| |
|
| |
|
| |
|
| | class TranscriptResult:
|
| | """Container for a fetched transcript."""
|
| |
|
| | def __init__(
|
| | self,
|
| | video_id: str,
|
| | raw_data: list[dict],
|
| | language_code: str,
|
| | language: str,
|
| | is_generated: bool,
|
| | ) -> None:
|
| | self.video_id = video_id
|
| | self.raw_data = raw_data
|
| | self.language_code = language_code
|
| | self.language = language
|
| | self.is_generated = is_generated
|
| |
|
| |
|
| |
|
| |
|
| |
|
| | @property
|
| | def plain_text(self) -> str:
|
| | """Plain transcript text without timestamps."""
|
| | return TextFormatter().format_transcript(self.raw_data)
|
| |
|
| | def timestamped_text(self) -> str:
|
| | """Plain text with [MM:SS.ss] prefixes."""
|
| | lines = []
|
| | for entry in self.raw_data:
|
| | m = int(entry["start"] // 60)
|
| | s = entry["start"] % 60
|
| | lines.append(f"[{m:02d}:{s:05.2f}] {entry['text']}")
|
| | return "\n".join(lines)
|
| |
|
| | def as_json(self) -> str:
|
| | return JSONFormatter().format_transcript(self.raw_data, indent=2)
|
| |
|
| | def as_srt(self) -> str:
|
| | return SRTFormatter().format_transcript(self.raw_data)
|
| |
|
| | def as_vtt(self) -> str:
|
| | return WebVTTFormatter().format_transcript(self.raw_data)
|
| |
|
| | def formatted(self, fmt: str, timestamps: bool = False) -> str:
|
| | """Return transcript in the requested format string."""
|
| | if fmt == "json":
|
| | return self.as_json()
|
| | if fmt == "srt":
|
| | return self.as_srt()
|
| | if fmt == "vtt":
|
| | return self.as_vtt()
|
| |
|
| | return self.timestamped_text() if timestamps else self.plain_text
|
| |
|
| | def __len__(self) -> int:
|
| | return len(self.plain_text)
|
| |
|
| |
|
| | def fetch(
|
| | video_id: str,
|
| | languages: Optional[list[str]] = None,
|
| | ) -> TranscriptResult:
|
| | """
|
| | Fetch a YouTube transcript directly via the caption API.
|
| |
|
| | Args:
|
| | video_id: 11-character YouTube video ID.
|
| | languages: Ordered list of preferred language codes.
|
| |
|
| | Returns:
|
| | TranscriptResult instance.
|
| |
|
| | Raises:
|
| | SystemExit on unrecoverable errors (TranscriptsDisabled, VideoUnavailable, etc.)
|
| | """
|
| | if languages is None:
|
| | languages = DEFAULT_LANGUAGES
|
| |
|
| | try:
|
| | tlist = YouTubeTranscriptApi.list_transcripts(video_id)
|
| |
|
| | try:
|
| | transcript_obj = tlist.find_transcript(languages)
|
| | except NoTranscriptFound:
|
| | all_t = (
|
| | list(tlist._manually_created_transcripts.values())
|
| | + list(tlist._generated_transcripts.values())
|
| | )
|
| | if not all_t:
|
| | raise NoTranscriptAvailable(video_id)
|
| | transcript_obj = all_t[0]
|
| | print(
|
| | f"[warn] Requested language(s) not found. "
|
| | f"Using [{transcript_obj.language_code}] {transcript_obj.language}.",
|
| | file=sys.stderr,
|
| | )
|
| |
|
| | raw = transcript_obj.fetch()
|
| | return TranscriptResult(
|
| | video_id=video_id,
|
| | raw_data=raw,
|
| | language_code=transcript_obj.language_code,
|
| | language=transcript_obj.language,
|
| | is_generated=transcript_obj.is_generated,
|
| | )
|
| |
|
| | except TranscriptsDisabled:
|
| | sys.exit(f"[error] Transcripts are disabled for video '{video_id}'.")
|
| | except VideoUnavailable:
|
| | sys.exit(f"[error] Video '{video_id}' is unavailable (private, deleted, or region-locked).")
|
| | except NoTranscriptAvailable:
|
| | sys.exit(f"[error] No transcript found for video '{video_id}'.")
|
| | except Exception as exc:
|
| | sys.exit(f"[error] Unexpected error while fetching transcript: {exc}") |