algorembrant's picture
Upload 12 files
d2bfe97 verified
"""
fetcher.py
Fetches YouTube transcripts directly via the caption API — no HTML parsing.
Author: algorembrant
"""
from __future__ import annotations
import re
import sys
from typing import Optional
from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api.formatters import (
JSONFormatter,
SRTFormatter,
TextFormatter,
WebVTTFormatter,
)
from youtube_transcript_api._errors import (
NoTranscriptAvailable,
NoTranscriptFound,
TranscriptsDisabled,
VideoUnavailable,
)
from config import DEFAULT_LANGUAGES
# ---------------------------------------------------------------------------
# URL / ID helpers
# ---------------------------------------------------------------------------
_ID_PATTERNS = [
r"(?:youtube\.com/watch\?.*v=)([a-zA-Z0-9_-]{11})",
r"(?:youtu\.be/)([a-zA-Z0-9_-]{11})",
r"(?:youtube\.com/shorts/)([a-zA-Z0-9_-]{11})",
r"(?:youtube\.com/embed/)([a-zA-Z0-9_-]{11})",
]
def extract_video_id(url_or_id: str) -> str:
"""Return the 11-character YouTube video ID from a URL or raw ID."""
for pattern in _ID_PATTERNS:
match = re.search(pattern, url_or_id)
if match:
return match.group(1)
if re.fullmatch(r"[a-zA-Z0-9_-]{11}", url_or_id):
return url_or_id
raise ValueError(
f"Cannot extract a valid YouTube video ID from: {url_or_id!r}\n"
"Accepted: full YouTube URL, youtu.be link, Shorts URL, embed URL, or raw 11-char ID."
)
# ---------------------------------------------------------------------------
# Language listing
# ---------------------------------------------------------------------------
def list_available_transcripts(video_id: str) -> None:
"""Print all available transcript languages for a video."""
tlist = YouTubeTranscriptApi.list_transcripts(video_id)
manual = list(tlist._manually_created_transcripts.values())
auto = list(tlist._generated_transcripts.values())
print(f"\nAvailable transcripts -- video: {video_id}\n")
if manual:
print("Manually created:")
for t in manual:
print(f" [{t.language_code:8s}] {t.language}")
if auto:
print("Auto-generated:")
for t in auto:
print(f" [{t.language_code:8s}] {t.language}")
if not manual and not auto:
print(" (none found)")
# ---------------------------------------------------------------------------
# Core fetch
# ---------------------------------------------------------------------------
class TranscriptResult:
"""Container for a fetched transcript."""
def __init__(
self,
video_id: str,
raw_data: list[dict],
language_code: str,
language: str,
is_generated: bool,
) -> None:
self.video_id = video_id
self.raw_data = raw_data # list of {text, start, duration}
self.language_code = language_code
self.language = language
self.is_generated = is_generated
# ------------------------------------------------------------------
# Convenience properties
# ------------------------------------------------------------------
@property
def plain_text(self) -> str:
"""Plain transcript text without timestamps."""
return TextFormatter().format_transcript(self.raw_data)
def timestamped_text(self) -> str:
"""Plain text with [MM:SS.ss] prefixes."""
lines = []
for entry in self.raw_data:
m = int(entry["start"] // 60)
s = entry["start"] % 60
lines.append(f"[{m:02d}:{s:05.2f}] {entry['text']}")
return "\n".join(lines)
def as_json(self) -> str:
return JSONFormatter().format_transcript(self.raw_data, indent=2)
def as_srt(self) -> str:
return SRTFormatter().format_transcript(self.raw_data)
def as_vtt(self) -> str:
return WebVTTFormatter().format_transcript(self.raw_data)
def formatted(self, fmt: str, timestamps: bool = False) -> str:
"""Return transcript in the requested format string."""
if fmt == "json":
return self.as_json()
if fmt == "srt":
return self.as_srt()
if fmt == "vtt":
return self.as_vtt()
# default: text
return self.timestamped_text() if timestamps else self.plain_text
def __len__(self) -> int:
return len(self.plain_text)
def fetch(
video_id: str,
languages: Optional[list[str]] = None,
) -> TranscriptResult:
"""
Fetch a YouTube transcript directly via the caption API.
Args:
video_id: 11-character YouTube video ID.
languages: Ordered list of preferred language codes.
Returns:
TranscriptResult instance.
Raises:
SystemExit on unrecoverable errors (TranscriptsDisabled, VideoUnavailable, etc.)
"""
if languages is None:
languages = DEFAULT_LANGUAGES
try:
tlist = YouTubeTranscriptApi.list_transcripts(video_id)
try:
transcript_obj = tlist.find_transcript(languages)
except NoTranscriptFound:
all_t = (
list(tlist._manually_created_transcripts.values())
+ list(tlist._generated_transcripts.values())
)
if not all_t:
raise NoTranscriptAvailable(video_id)
transcript_obj = all_t[0]
print(
f"[warn] Requested language(s) not found. "
f"Using [{transcript_obj.language_code}] {transcript_obj.language}.",
file=sys.stderr,
)
raw = transcript_obj.fetch()
return TranscriptResult(
video_id=video_id,
raw_data=raw,
language_code=transcript_obj.language_code,
language=transcript_obj.language,
is_generated=transcript_obj.is_generated,
)
except TranscriptsDisabled:
sys.exit(f"[error] Transcripts are disabled for video '{video_id}'.")
except VideoUnavailable:
sys.exit(f"[error] Video '{video_id}' is unavailable (private, deleted, or region-locked).")
except NoTranscriptAvailable:
sys.exit(f"[error] No transcript found for video '{video_id}'.")
except Exception as exc:
sys.exit(f"[error] Unexpected error while fetching transcript: {exc}")