File size: 6,603 Bytes

d2bfe97

"""

fetcher.py

Fetches YouTube transcripts directly via the caption API — no HTML parsing.

Author: algorembrant

"""

from __future__ import annotations

import re
import sys
from typing import Optional

from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api.formatters import (
    JSONFormatter,
    SRTFormatter,
    TextFormatter,
    WebVTTFormatter,
)
from youtube_transcript_api._errors import (
    NoTranscriptAvailable,
    NoTranscriptFound,
    TranscriptsDisabled,
    VideoUnavailable,
)

from config import DEFAULT_LANGUAGES


# ---------------------------------------------------------------------------
# URL / ID helpers
# ---------------------------------------------------------------------------

_ID_PATTERNS = [
    r"(?:youtube\.com/watch\?.*v=)([a-zA-Z0-9_-]{11})",
    r"(?:youtu\.be/)([a-zA-Z0-9_-]{11})",
    r"(?:youtube\.com/shorts/)([a-zA-Z0-9_-]{11})",
    r"(?:youtube\.com/embed/)([a-zA-Z0-9_-]{11})",
]


def extract_video_id(url_or_id: str) -> str:
    """Return the 11-character YouTube video ID from a URL or raw ID."""
    for pattern in _ID_PATTERNS:
        match = re.search(pattern, url_or_id)
        if match:
            return match.group(1)

    if re.fullmatch(r"[a-zA-Z0-9_-]{11}", url_or_id):
        return url_or_id

    raise ValueError(
        f"Cannot extract a valid YouTube video ID from: {url_or_id!r}\n"
        "Accepted: full YouTube URL, youtu.be link, Shorts URL, embed URL, or raw 11-char ID."
    )


# ---------------------------------------------------------------------------
# Language listing
# ---------------------------------------------------------------------------

def list_available_transcripts(video_id: str) -> None:
    """Print all available transcript languages for a video."""
    tlist = YouTubeTranscriptApi.list_transcripts(video_id)

    manual = list(tlist._manually_created_transcripts.values())
    auto   = list(tlist._generated_transcripts.values())

    print(f"\nAvailable transcripts  --  video: {video_id}\n")
    if manual:
        print("Manually created:")
        for t in manual:
            print(f"  [{t.language_code:8s}] {t.language}")
    if auto:
        print("Auto-generated:")
        for t in auto:
            print(f"  [{t.language_code:8s}] {t.language}")
    if not manual and not auto:
        print("  (none found)")


# ---------------------------------------------------------------------------
# Core fetch
# ---------------------------------------------------------------------------

class TranscriptResult:
    """Container for a fetched transcript."""

    def __init__(

        self,

        video_id: str,

        raw_data: list[dict],

        language_code: str,

        language: str,

        is_generated: bool,

    ) -> None:
        self.video_id      = video_id
        self.raw_data      = raw_data          # list of {text, start, duration}
        self.language_code = language_code
        self.language      = language
        self.is_generated  = is_generated

    # ------------------------------------------------------------------
    # Convenience properties
    # ------------------------------------------------------------------

    @property
    def plain_text(self) -> str:
        """Plain transcript text without timestamps."""
        return TextFormatter().format_transcript(self.raw_data)

    def timestamped_text(self) -> str:
        """Plain text with [MM:SS.ss] prefixes."""
        lines = []
        for entry in self.raw_data:
            m = int(entry["start"] // 60)
            s = entry["start"] % 60
            lines.append(f"[{m:02d}:{s:05.2f}] {entry['text']}")
        return "\n".join(lines)

    def as_json(self) -> str:
        return JSONFormatter().format_transcript(self.raw_data, indent=2)

    def as_srt(self) -> str:
        return SRTFormatter().format_transcript(self.raw_data)

    def as_vtt(self) -> str:
        return WebVTTFormatter().format_transcript(self.raw_data)

    def formatted(self, fmt: str, timestamps: bool = False) -> str:
        """Return transcript in the requested format string."""
        if fmt == "json":
            return self.as_json()
        if fmt == "srt":
            return self.as_srt()
        if fmt == "vtt":
            return self.as_vtt()
        # default: text
        return self.timestamped_text() if timestamps else self.plain_text

    def __len__(self) -> int:
        return len(self.plain_text)


def fetch(

    video_id: str,

    languages: Optional[list[str]] = None,

) -> TranscriptResult:
    """

    Fetch a YouTube transcript directly via the caption API.



    Args:

        video_id:  11-character YouTube video ID.

        languages: Ordered list of preferred language codes.



    Returns:

        TranscriptResult instance.



    Raises:

        SystemExit on unrecoverable errors (TranscriptsDisabled, VideoUnavailable, etc.)

    """
    if languages is None:
        languages = DEFAULT_LANGUAGES

    try:
        tlist = YouTubeTranscriptApi.list_transcripts(video_id)

        try:
            transcript_obj = tlist.find_transcript(languages)
        except NoTranscriptFound:
            all_t = (
                list(tlist._manually_created_transcripts.values())
                + list(tlist._generated_transcripts.values())
            )
            if not all_t:
                raise NoTranscriptAvailable(video_id)
            transcript_obj = all_t[0]
            print(
                f"[warn] Requested language(s) not found. "
                f"Using [{transcript_obj.language_code}] {transcript_obj.language}.",
                file=sys.stderr,
            )

        raw = transcript_obj.fetch()
        return TranscriptResult(
            video_id=video_id,
            raw_data=raw,
            language_code=transcript_obj.language_code,
            language=transcript_obj.language,
            is_generated=transcript_obj.is_generated,
        )

    except TranscriptsDisabled:
        sys.exit(f"[error] Transcripts are disabled for video '{video_id}'.")
    except VideoUnavailable:
        sys.exit(f"[error] Video '{video_id}' is unavailable (private, deleted, or region-locked).")
    except NoTranscriptAvailable:
        sys.exit(f"[error] No transcript found for video '{video_id}'.")
    except Exception as exc:
        sys.exit(f"[error] Unexpected error while fetching transcript: {exc}")