| |
|
| | """
|
| | YouTube Transcript Fetcher
|
| | Fetches transcripts directly from YouTube videos using the YouTube Transcript API.
|
| | No HTML parsing or scraping involved.
|
| | """
|
| |
|
| | import argparse
|
| | import json
|
| | import sys
|
| | import re
|
| | from typing import Optional
|
| |
|
| | from youtube_transcript_api import YouTubeTranscriptApi
|
| | from youtube_transcript_api.formatters import (
|
| | TextFormatter,
|
| | JSONFormatter,
|
| | SRTFormatter,
|
| | WebVTTFormatter,
|
| | )
|
| | from youtube_transcript_api._errors import (
|
| | TranscriptsDisabled,
|
| | NoTranscriptFound,
|
| | VideoUnavailable,
|
| | CouldNotRetrieveTranscript,
|
| | )
|
| |
|
| |
|
def extract_video_id(url_or_id: str) -> str:
    """
    Extract the video ID from a YouTube URL or return it directly if already an ID.

    Supports formats:
        - https://www.youtube.com/watch?v=VIDEO_ID (``v`` may appear anywhere
          in the query string)
        - https://youtu.be/VIDEO_ID
        - https://www.youtube.com/shorts/VIDEO_ID
        - https://www.youtube.com/embed/VIDEO_ID
        - VIDEO_ID (raw)

    Args:
        url_or_id: A YouTube URL or a raw 11-character video ID. Leading and
            trailing whitespace is ignored.

    Returns:
        The 11-character video ID.

    Raises:
        ValueError: If no video ID can be extracted from ``url_or_id``.
    """
    candidate = url_or_id.strip()

    patterns = (
        # Only accept a query parameter that is literally named "v": it must
        # sit right after the "?" or right after a "&"/";" separator. The
        # lazy quantifiers make the first such parameter win. A plain greedy
        # ".*v=" would instead grab the LAST "...v=" in the URL (for example
        # "&rv=...") and return the wrong ID.
        r"youtube\.com/watch\?(?:[^#]*?[&;])??v=([a-zA-Z0-9_-]{11})",
        r"youtu\.be/([a-zA-Z0-9_-]{11})",
        r"youtube\.com/shorts/([a-zA-Z0-9_-]{11})",
        r"youtube\.com/embed/([a-zA-Z0-9_-]{11})",
    )
    for pattern in patterns:
        match = re.search(pattern, candidate)
        if match:
            return match.group(1)

    # Not a recognised URL: accept the input only if it is exactly one raw ID.
    if re.fullmatch(r"[a-zA-Z0-9_-]{11}", candidate):
        return candidate

    raise ValueError(
        f"Could not extract a valid YouTube video ID from: {url_or_id}\n"
        "Accepted formats: full URL, youtu.be short link, or raw 11-character video ID."
    )
|
| |
|
| |
|
def list_available_transcripts(video_id: str) -> None:
    """List all available transcript languages for a video.

    Prints the manually created transcripts first and the auto-generated
    ones second, one ``[language_code] language`` line each, to stdout.

    Args:
        video_id: YouTube video ID.

    Note:
        Exceptions raised by the transcript API are not handled here and
        propagate to the caller.
    """
    transcript_list = YouTubeTranscriptApi().list(video_id)

    print(f"\nAvailable transcripts for video: {video_id}\n")

    # Split on the public ``is_generated`` flag while iterating the list,
    # instead of reaching into the library's private ``_..._transcripts``
    # dictionaries, which may change between releases.
    manually_created = []
    auto_generated = []
    for transcript in transcript_list:
        if transcript.is_generated:
            auto_generated.append(transcript)
        else:
            manually_created.append(transcript)

    if manually_created:
        print("Manually created:")
        for t in manually_created:
            print(f" [{t.language_code}] {t.language}")

    if auto_generated:
        print("Auto-generated:")
        for t in auto_generated:
            print(f" [{t.language_code}] {t.language} (auto)")

    if not manually_created and not auto_generated:
        print(" No transcripts found.")
|
| |
|
| |
|
def _snippet_field(snippet, name: str):
    """Return field *name* of a transcript snippet given as a dict or an object."""
    # NOTE(review): youtube-transcript-api 1.x (the release line that offers
    # the instance-style ``YouTubeTranscriptApi().list``) yields snippet
    # objects with attributes; older releases yielded plain dicts. Accept both.
    if isinstance(snippet, dict):
        return snippet[name]
    return getattr(snippet, name)


def _format_timestamped_text(transcript_data) -> str:
    """Render each snippet as a ``[MM:SS.ss] text`` line."""
    lines = []
    for entry in transcript_data:
        # Round to whole centiseconds BEFORE splitting into minutes/seconds so
        # a start such as 59.996 becomes 01:00.00 rather than 00:60.00.
        total_centiseconds = round(float(_snippet_field(entry, "start")) * 100)
        minutes, centiseconds = divmod(total_centiseconds, 6000)
        text = _snippet_field(entry, "text")
        lines.append(f"[{minutes:02d}:{centiseconds / 100:05.2f}] {text}")
    return "\n".join(lines)


def _render_transcript(transcript_data, output_format: str, preserve_timestamps: bool) -> str:
    """Format fetched transcript data as json, srt, vtt or (default) plain text."""
    if output_format == "json":
        return JSONFormatter().format_transcript(transcript_data, indent=2)
    if output_format == "srt":
        return SRTFormatter().format_transcript(transcript_data)
    if output_format == "vtt":
        return WebVTTFormatter().format_transcript(transcript_data)
    # Any other value is treated as plain text.
    if preserve_timestamps:
        return _format_timestamped_text(transcript_data)
    return TextFormatter().format_transcript(transcript_data)


def fetch_transcript(
    video_id: str,
    languages: Optional[list] = None,
    output_format: str = "text",
    preserve_timestamps: bool = False,
    output_file: Optional[str] = None,
) -> str:
    """
    Fetch transcript for a given video ID.

    Args:
        video_id: YouTube video ID.
        languages: Ordered list of language codes to try (e.g. ['en', 'es']).
            Defaults to ['en'] when None. If none of the requested languages
            is found, the first available transcript is used and a warning is
            written to stderr.
        output_format: One of 'text', 'json', 'srt', 'vtt'. Any other value
            is treated as 'text'.
        preserve_timestamps: Include timestamps in plain-text output.
        output_file: If provided, write transcript to this file path (UTF-8)
            and print a confirmation line to stdout.

    Returns:
        The transcript as a formatted string.

    Raises:
        SystemExit: With exit code 1, after printing a message to stderr, on
            any failure (transcripts disabled, video unavailable, no
            transcript at all, or any other error, including file-write
            errors).
    """
    if languages is None:
        languages = ["en"]

    try:
        transcript_list = YouTubeTranscriptApi().list(video_id)

        try:
            transcript = transcript_list.find_transcript(languages)
        except NoTranscriptFound:
            # None of the preferred languages exist: fall back to whatever
            # transcript the video lists first, if it has any.
            transcript = next(iter(transcript_list), None)

            if transcript is None:
                print(f"Error: No transcript is available for video '{video_id}'.", file=sys.stderr)
                sys.exit(1)

            print(
                f"Warning: None of the requested languages found. "
                f"Using [{transcript.language_code}] {transcript.language} instead.",
                file=sys.stderr,
            )

        result = _render_transcript(transcript.fetch(), output_format, preserve_timestamps)

        if output_file:
            with open(output_file, "w", encoding="utf-8") as f:
                f.write(result)
            print(f"Transcript saved to: {output_file}")

        return result

    except TranscriptsDisabled:
        print(f"Error: Transcripts are disabled for video '{video_id}'.", file=sys.stderr)
        sys.exit(1)
    except VideoUnavailable:
        print(f"Error: Video '{video_id}' is unavailable or does not exist.", file=sys.stderr)
        sys.exit(1)
    except CouldNotRetrieveTranscript as e:
        print(f"Error for video '{video_id}': {e}", file=sys.stderr)
        sys.exit(1)
    except Exception as e:
        # Last-resort boundary: SystemExit is not an Exception subclass, so
        # the sys.exit(1) raised inside the try block passes through untouched.
        print(f"Unexpected error: {e}", file=sys.stderr)
        sys.exit(1)
|
| |
|
| |
|
def fetch_multiple(
    video_ids: list,
    languages: Optional[list] = None,
    output_format: str = "text",
    preserve_timestamps: bool = False,
    output_dir: Optional[str] = None,
) -> dict:
    """
    Fetch transcripts for multiple video IDs.

    A failure on one video does not stop the batch: the ``SystemExit`` that
    ``fetch_transcript`` raises on error is caught and recorded.

    Args:
        video_ids: List of YouTube video IDs.
        languages: Language preference list.
        output_format: Output format string.
        preserve_timestamps: Include timestamps.
        output_dir: Directory to save individual transcript files. It is
            created if missing; each file is named ``<video_id>.<ext>``.

    Returns:
        Dictionary mapping each video_id to
        ``{"status": "ok", "transcript": <str>}`` on success or
        ``{"status": "error", "transcript": None}`` on failure.
    """
    import os

    # The extension and the target directory do not depend on the video, so
    # resolve them once instead of on every loop iteration.
    ext = None
    if output_dir:
        ext_map = {"text": "txt", "json": "json", "srt": "srt", "vtt": "vtt"}
        ext = ext_map.get(output_format, "txt")
        os.makedirs(output_dir, exist_ok=True)

    results = {}
    for vid in video_ids:
        print(f"Fetching: {vid}", file=sys.stderr)
        out_file = os.path.join(output_dir, f"{vid}.{ext}") if output_dir else None
        try:
            transcript = fetch_transcript(
                video_id=vid,
                languages=languages,
                output_format=output_format,
                preserve_timestamps=preserve_timestamps,
                output_file=out_file,
            )
        except SystemExit:
            # fetch_transcript has already printed the reason to stderr.
            results[vid] = {"status": "error", "transcript": None}
        else:
            results[vid] = {"status": "ok", "transcript": transcript}

    return results
|
| |
|
| |
|
def parse_args():
    """Build the command-line interface and parse ``sys.argv``.

    Returns:
        The ``argparse.Namespace`` with attributes ``video``, ``languages``,
        ``format``, ``timestamps``, ``output`` and ``list``.
    """
    cli = argparse.ArgumentParser(
        formatter_class=argparse.RawTextHelpFormatter,
        description="Fetch YouTube video transcripts directly — no scraping required.",
    )

    # (flags, keyword settings) pairs, registered in this order so the
    # generated help text lists the options in the same sequence.
    argument_table = [
        (
            ("video",),
            {"nargs": "+", "help": "YouTube video URL(s) or video ID(s)."},
        ),
        (
            ("-l", "--languages"),
            {
                "nargs": "+",
                "default": ["en"],
                "metavar": "LANG",
                "help": "Language codes in order of preference (default: en).\nExample: --languages en es fr",
            },
        ),
        (
            ("-f", "--format"),
            {
                "choices": ["text", "json", "srt", "vtt"],
                "default": "text",
                "help": "Output format (default: text).",
            },
        ),
        (
            ("-t", "--timestamps"),
            {"action": "store_true", "help": "Include timestamps in plain-text output."},
        ),
        (
            ("-o", "--output"),
            {
                "metavar": "PATH",
                "help": "Output file path (single video) or directory (multiple videos).",
            },
        ),
        (
            ("--list",),
            {
                "action": "store_true",
                "help": "List all available transcript languages for the video(s) and exit.",
            },
        ),
    ]
    for flags, settings in argument_table:
        cli.add_argument(*flags, **settings)

    return cli.parse_args()
|
| |
|
| |
|
def main():
    """Command-line entry point.

    Parses the arguments, resolves every positional argument to a video ID
    and then either lists the available transcripts (``--list``) or fetches
    them. A single video is printed to stdout or written to the ``--output``
    file; several videos are printed one after another or written into the
    ``--output`` directory.

    Exit status:
        2 if an argument is not a recognisable YouTube URL or video ID,
        1 if any transcript could not be listed or fetched,
        0 otherwise.
    """
    args = parse_args()

    try:
        video_ids = [extract_video_id(v) for v in args.video]
    except ValueError as exc:
        # Bad user input: show a clean message instead of a traceback and use
        # the same exit status argparse uses for usage errors.
        print(f"Error: {exc}", file=sys.stderr)
        sys.exit(2)

    if args.list:
        any_failed = False
        for vid in video_ids:
            try:
                list_available_transcripts(vid)
            except CouldNotRetrieveTranscript as exc:
                # Keep listing the remaining videos, but remember the failure.
                any_failed = True
                print(f"Error for video '{vid}': {exc}", file=sys.stderr)
        if any_failed:
            sys.exit(1)
        return

    if len(video_ids) == 1:
        # fetch_transcript exits the process itself on failure.
        transcript = fetch_transcript(
            video_id=video_ids[0],
            languages=args.languages,
            output_format=args.format,
            preserve_timestamps=args.timestamps,
            output_file=args.output,
        )
        if not args.output:
            print(transcript)
        return

    results = fetch_multiple(
        video_ids=video_ids,
        languages=args.languages,
        output_format=args.format,
        preserve_timestamps=args.timestamps,
        output_dir=args.output,
    )
    if not args.output:
        for vid, data in results.items():
            print(f"\n{'='*60}")
            print(f"Video ID: {vid}")
            print(f"{'='*60}")
            if data["status"] == "ok":
                print(data["transcript"])
            else:
                print("Failed to retrieve transcript.")

    # Signal partial failure to the calling shell instead of reporting success.
    if any(data["status"] != "ok" for data in results.values()):
        sys.exit(1)
|
| |
|
| |
|
# Run the CLI only when this file is executed as a script; importing the
# module must not trigger any fetching.
if __name__ == "__main__":
    main()