File size: 10,109 Bytes

297ee7b

#!/usr/bin/env python3
"""

YouTube Transcript Fetcher

Fetches transcripts directly from YouTube videos using the YouTube Transcript API.

No HTML parsing or scraping involved.

"""

import argparse
import json
import sys
import re
from typing import Optional

from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api.formatters import (
    TextFormatter,
    JSONFormatter,
    SRTFormatter,
    WebVTTFormatter,
)
from youtube_transcript_api._errors import (
    TranscriptsDisabled,
    NoTranscriptFound,
    VideoUnavailable,
    CouldNotRetrieveTranscript,
)


def extract_video_id(url_or_id: str) -> str:
    """
    Extract the video ID from a YouTube URL or return it directly if already an ID.

    Surrounding whitespace is ignored. Supports formats:
        - https://www.youtube.com/watch?v=VIDEO_ID (``v`` may appear anywhere
          in the query string)
        - https://youtu.be/VIDEO_ID
        - https://www.youtube.com/shorts/VIDEO_ID
        - https://www.youtube.com/embed/VIDEO_ID
        - https://www.youtube.com/live/VIDEO_ID
        - VIDEO_ID (raw)

    Args:
        url_or_id: A YouTube URL or a raw 11-character video ID.

    Returns:
        The 11-character video ID.

    Raises:
        ValueError: If no video ID can be extracted from the input.
    """
    candidate = url_or_id.strip()
    id_group = r"([a-zA-Z0-9_-]{11})"
    patterns = (
        # Walk the query string one "name=value&" segment at a time (lazily) so
        # that only a parameter named exactly "v" matches -- not "rv=", "adv=",
        # etc. -- and the first such parameter wins.
        r"youtube\.com/watch\?(?:[^#&]*&)*?v=" + id_group,
        r"youtu\.be/" + id_group,
        r"youtube\.com/(?:shorts|embed|live)/" + id_group,
    )
    for pattern in patterns:
        match = re.search(pattern, candidate)
        if match:
            return match.group(1)

    # Assume raw video ID if it looks like one
    if re.fullmatch(id_group, candidate):
        return candidate

    raise ValueError(
        f"Could not extract a valid YouTube video ID from: {url_or_id}\n"
        "Accepted formats: full URL, youtu.be short link, or raw 11-character video ID."
    )


def list_available_transcripts(video_id: str) -> None:
    """
    Print the transcript languages available for a video, grouped by origin.

    Manually created transcripts are listed first, followed by auto-generated
    ones; a placeholder line is printed when the video has neither.

    Args:
        video_id: YouTube video ID.

    Note:
        Errors raised by the transcript API while listing are not handled
        here and propagate to the caller.
    """
    api = YouTubeTranscriptApi()
    transcript_list = api.list(video_id)

    print(f"\nAvailable transcripts for video: {video_id}\n")

    # Partition with the public ``is_generated`` flag instead of reaching into
    # the library's private ``_manually_created_transcripts`` /
    # ``_generated_transcripts`` attributes, which may change between releases.
    manually_created = []
    auto_generated = []
    for t in transcript_list:
        if t.is_generated:
            auto_generated.append(t)
        else:
            manually_created.append(t)

    if manually_created:
        print("Manually created:")
        for t in manually_created:
            print(f"  [{t.language_code}] {t.language}")

    if auto_generated:
        print("Auto-generated:")
        for t in auto_generated:
            print(f"  [{t.language_code}] {t.language} (auto)")

    if not manually_created and not auto_generated:
        print("  No transcripts found.")


def _format_timestamped_text(transcript_data) -> str:
    """Render transcript snippets as one ``[MM:SS.ss] text`` line per snippet."""
    lines = []
    for snippet in transcript_data:
        # Snippets are read through their ``start`` / ``text`` attributes
        # (they are objects, not dicts). Work in whole centiseconds so a start
        # such as 59.999 s rolls over to [01:00.00] rather than [00:60.00].
        minutes, centiseconds = divmod(round(snippet.start * 100), 6000)
        lines.append(f"[{minutes:02d}:{centiseconds / 100:05.2f}] {snippet.text}")
    return "\n".join(lines)


def fetch_transcript(
    video_id: str,
    languages: Optional[list] = None,
    output_format: str = "text",
    preserve_timestamps: bool = False,
    output_file: Optional[str] = None,
) -> str:
    """
    Fetch transcript for a given video ID.

    Args:
        video_id:            YouTube video ID.
        languages:           Ordered list of language codes to try (e.g. ['en', 'es']).
                             Defaults to ['en'] when None. If none of the requested
                             languages is found, the first available transcript is
                             used and a warning is printed to stderr.
        output_format:       One of 'text', 'json', 'srt', 'vtt'. Any other value is
                             treated as 'text'.
        preserve_timestamps: Include timestamps in plain-text output (ignored for
                             the other formats).
        output_file:         If provided, write transcript to this file path (UTF-8).

    Returns:
        The transcript as a formatted string.

    Raises:
        SystemExit: With status 1, after printing a message to stderr, when no
            transcript can be retrieved or any other error occurs.
    """
    if languages is None:
        languages = ["en"]

    try:
        api = YouTubeTranscriptApi()
        transcript_list = api.list(video_id)

        # Try requested languages first; fall back to any available transcript
        try:
            transcript = transcript_list.find_transcript(languages)
        except NoTranscriptFound:
            # Grab whatever is available
            all_transcripts = list(transcript_list)

            if not all_transcripts:
                print(f"Error: No transcript is available for video '{video_id}'.", file=sys.stderr)
                # SystemExit is not an Exception subclass, so it passes
                # through the handlers below untouched.
                sys.exit(1)

            transcript = all_transcripts[0]
            print(
                f"Warning: None of the requested languages found. "
                f"Using [{transcript.language_code}] {transcript.language} instead.",
                file=sys.stderr,
            )

        transcript_data = transcript.fetch()

        # Format
        if output_format == "json":
            result = JSONFormatter().format_transcript(transcript_data, indent=2)
        elif output_format == "srt":
            result = SRTFormatter().format_transcript(transcript_data)
        elif output_format == "vtt":
            result = WebVTTFormatter().format_transcript(transcript_data)
        elif preserve_timestamps:
            result = _format_timestamped_text(transcript_data)
        else:  # default: plain text
            result = TextFormatter().format_transcript(transcript_data)

        if output_file:
            with open(output_file, "w", encoding="utf-8") as f:
                f.write(result)
            print(f"Transcript saved to: {output_file}")

        return result

    except TranscriptsDisabled:
        print(f"Error: Transcripts are disabled for video '{video_id}'.", file=sys.stderr)
        sys.exit(1)
    except VideoUnavailable:
        print(f"Error: Video '{video_id}' is unavailable or does not exist.", file=sys.stderr)
        sys.exit(1)
    except CouldNotRetrieveTranscript as e:
        print(f"Error for video '{video_id}': {e}", file=sys.stderr)
        sys.exit(1)
    except Exception as e:
        # Last-resort boundary (e.g. file write failures): report and exit
        # rather than show a traceback to CLI users.
        print(f"Unexpected error: {e}", file=sys.stderr)
        sys.exit(1)


def fetch_multiple(
    video_ids: list,
    languages: Optional[list] = None,
    output_format: str = "text",
    preserve_timestamps: bool = False,
    output_dir: Optional[str] = None,
) -> dict:
    """
    Fetch transcripts for multiple video IDs.

    A failure for one video does not stop the batch: the ``SystemExit`` raised
    by ``fetch_transcript`` is caught and recorded as an error entry.

    Args:
        video_ids:           List of YouTube video IDs.
        languages:           Language preference list.
        output_format:       Output format string.
        preserve_timestamps: Include timestamps.
        output_dir:          Directory to save individual transcript files
                             (created if missing); each file is named
                             ``<video_id>.<ext>``.

    Returns:
        Dictionary mapping video_id -> ``{"status": "ok", "transcript": str}``
        on success or ``{"status": "error", "transcript": None}`` on failure.
    """
    import os

    # The extension and the output directory do not depend on the video, so
    # resolve them once instead of on every iteration.
    ext_map = {"text": "txt", "json": "json", "srt": "srt", "vtt": "vtt"}
    ext = ext_map.get(output_format, "txt")
    if output_dir and video_ids:
        os.makedirs(output_dir, exist_ok=True)

    results = {}
    for vid in video_ids:
        print(f"Fetching: {vid}", file=sys.stderr)
        out_file = os.path.join(output_dir, f"{vid}.{ext}") if output_dir else None
        try:
            transcript = fetch_transcript(
                video_id=vid,
                languages=languages,
                output_format=output_format,
                preserve_timestamps=preserve_timestamps,
                output_file=out_file,
            )
        except SystemExit:
            # fetch_transcript reports its own error on stderr before exiting.
            results[vid] = {"status": "error", "transcript": None}
        else:
            results[vid] = {"status": "ok", "transcript": transcript}

    return results


def parse_args():
    """
    Build the command-line parser and parse the process arguments.

    Returns:
        The ``argparse.Namespace`` with attributes ``video``, ``languages``,
        ``format``, ``timestamps``, ``output`` and ``list``.
    """
    # Each entry is (flags, keyword options), registered in this order.
    argument_table = (
        (
            ("video",),
            {
                "nargs": "+",
                "help": "YouTube video URL(s) or video ID(s).",
            },
        ),
        (
            ("-l", "--languages"),
            {
                "nargs": "+",
                "default": ["en"],
                "metavar": "LANG",
                "help": "Language codes in order of preference (default: en).\nExample: --languages en es fr",
            },
        ),
        (
            ("-f", "--format"),
            {
                "choices": ["text", "json", "srt", "vtt"],
                "default": "text",
                "help": "Output format (default: text).",
            },
        ),
        (
            ("-t", "--timestamps"),
            {
                "action": "store_true",
                "help": "Include timestamps in plain-text output.",
            },
        ),
        (
            ("-o", "--output"),
            {
                "metavar": "PATH",
                "help": "Output file path (single video) or directory (multiple videos).",
            },
        ),
        (
            ("--list",),
            {
                "action": "store_true",
                "help": "List all available transcript languages for the video(s) and exit.",
            },
        ),
    )

    cli = argparse.ArgumentParser(
        description="Fetch YouTube video transcripts directly — no scraping required.",
        formatter_class=argparse.RawTextHelpFormatter,
    )
    for flags, options in argument_table:
        cli.add_argument(*flags, **options)

    return cli.parse_args()


def main():
    """
    Command-line entry point.

    Parses the arguments, resolves every positional argument to a video ID,
    then either lists the available transcripts (``--list``) or fetches them.
    With one video the transcript goes to stdout or to the ``--output`` file;
    with several videos ``--output`` is treated as a directory, otherwise each
    transcript is printed under a banner.

    Exits with status 1 when an argument is not a recognisable video URL/ID
    or when any video could not be processed.
    """
    args = parse_args()

    try:
        video_ids = [extract_video_id(v) for v in args.video]
    except ValueError as err:
        # Bad user input: report it like every other error instead of
        # surfacing a traceback.
        print(f"Error: {err}", file=sys.stderr)
        sys.exit(1)

    if args.list:
        failed = False
        for vid in video_ids:
            try:
                list_available_transcripts(vid)
            except CouldNotRetrieveTranscript as err:
                # Keep going so one bad video does not hide the others.
                print(f"Error for video '{vid}': {err}", file=sys.stderr)
                failed = True
        if failed:
            sys.exit(1)
        return

    if len(video_ids) == 1:
        transcript = fetch_transcript(
            video_id=video_ids[0],
            languages=args.languages,
            output_format=args.format,
            preserve_timestamps=args.timestamps,
            output_file=args.output,
        )
        if not args.output:
            print(transcript)
    else:
        results = fetch_multiple(
            video_ids=video_ids,
            languages=args.languages,
            output_format=args.format,
            preserve_timestamps=args.timestamps,
            output_dir=args.output,
        )
        if not args.output:
            for vid, data in results.items():
                print(f"\n{'='*60}")
                print(f"Video ID: {vid}")
                print(f"{'='*60}")
                if data["status"] == "ok":
                    print(data["transcript"])
                else:
                    print("Failed to retrieve transcript.")

        # Match single-video mode, which exits non-zero on failure.
        if any(data["status"] != "ok" for data in results.values()):
            sys.exit(1)


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()