#!/usr/bin/env python3
"""
YouTube Transcript Fetcher

Fetches transcripts directly from YouTube videos using the YouTube Transcript API.
No HTML parsing or scraping involved.
"""

import argparse
import json
import os
import re
import sys
from typing import Optional

from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api.formatters import (
    TextFormatter,
    JSONFormatter,
    SRTFormatter,
    WebVTTFormatter,
)
from youtube_transcript_api._errors import (
    TranscriptsDisabled,
    NoTranscriptFound,
    VideoUnavailable,
    CouldNotRetrieveTranscript,
)

# URL shapes a video ID can be pulled from, tried in order.
# The watch pattern requires `v` to be a complete query-parameter name (directly
# after `?` or after an `&`), so parameters that merely end in "v" (e.g. `env=`)
# are never mistaken for the video ID; the lazy prefix prefers the first `v=`.
_URL_PATTERNS = tuple(
    re.compile(pattern)
    for pattern in (
        r"youtube\.com/watch\?(?:[^#]*?&)?v=([a-zA-Z0-9_-]{11})",
        r"youtu\.be/([a-zA-Z0-9_-]{11})",
        r"youtube\.com/shorts/([a-zA-Z0-9_-]{11})",
        r"youtube\.com/embed/([a-zA-Z0-9_-]{11})",
    )
)
_RAW_ID_PATTERN = re.compile(r"[a-zA-Z0-9_-]{11}")

# File extension used for each output format when saving to a directory.
_EXTENSIONS = {"text": "txt", "json": "json", "srt": "srt", "vtt": "vtt"}


def extract_video_id(url_or_id: str) -> str:
    """
    Extract the video ID from a YouTube URL or return it directly if already an ID.

    Supports formats:
      - https://www.youtube.com/watch?v=VIDEO_ID
      - https://youtu.be/VIDEO_ID
      - https://www.youtube.com/shorts/VIDEO_ID
      - https://www.youtube.com/embed/VIDEO_ID
      - VIDEO_ID (raw)

    Args:
        url_or_id: A YouTube URL or a raw 11-character video ID. Surrounding
            whitespace is ignored.

    Returns:
        The 11-character video ID.

    Raises:
        ValueError: If no video ID can be recognised in the input.
    """
    candidate = url_or_id.strip()

    for pattern in _URL_PATTERNS:
        match = pattern.search(candidate)
        if match:
            return match.group(1)

    # Assume raw video ID if it looks like one
    if _RAW_ID_PATTERN.fullmatch(candidate):
        return candidate

    raise ValueError(
        f"Could not extract a valid YouTube video ID from: {url_or_id}\n"
        "Accepted formats: full URL, youtu.be short link, or raw 11-character video ID."
    )


def list_available_transcripts(video_id: str) -> None:
    """
    Print all available transcript languages for a video to stdout.

    Args:
        video_id: YouTube video ID.

    Errors raised by the transcript API while listing are not handled here and
    propagate to the caller.
    """
    api = YouTubeTranscriptApi()
    transcript_list = api.list(video_id)

    print(f"\nAvailable transcripts for video: {video_id}\n")

    # Split on the public `is_generated` flag instead of reaching into the
    # transcript list's private dictionaries.
    manually_created = []
    auto_generated = []
    for transcript in transcript_list:
        if transcript.is_generated:
            auto_generated.append(transcript)
        else:
            manually_created.append(transcript)

    if manually_created:
        print("Manually created:")
        for t in manually_created:
            print(f"  [{t.language_code}] {t.language}")

    if auto_generated:
        print("Auto-generated:")
        for t in auto_generated:
            print(f"  [{t.language_code}] {t.language} (auto)")

    if not manually_created and not auto_generated:
        print("  No transcripts found.")


def _snippet_field(entry, name: str):
    """Read field *name* from a transcript snippet given as an object or a dict."""
    if isinstance(entry, dict):
        return entry[name]
    return getattr(entry, name)


def _format_timestamp(start: float) -> str:
    """Render a start offset in seconds as ``[MM:SS.ss]``."""
    # Round to centiseconds before splitting so that e.g. 59.999 s becomes
    # 01:00.00 rather than the impossible 00:60.00.
    minutes, centiseconds = divmod(round(start * 100), 6000)
    return f"[{minutes:02d}:{centiseconds / 100:05.2f}]"


def _select_transcript(transcript_list, languages: list, video_id: str):
    """Pick a transcript in a preferred language, else the first one available.

    Prints an error and exits with status 1 when the video has no transcript.
    """
    try:
        return transcript_list.find_transcript(languages)
    except NoTranscriptFound:
        # Grab whatever is available
        all_transcripts = list(transcript_list)
        if not all_transcripts:
            print(f"Error: No transcript is available for video '{video_id}'.", file=sys.stderr)
            sys.exit(1)
        transcript = all_transcripts[0]
        print(
            f"Warning: None of the requested languages found. "
            f"Using [{transcript.language_code}] {transcript.language} instead.",
            file=sys.stderr,
        )
        return transcript


def _render_transcript(transcript_data, output_format: str, preserve_timestamps: bool) -> str:
    """Format fetched transcript data as text, JSON, SRT or WebVTT."""
    if output_format == "json":
        return JSONFormatter().format_transcript(transcript_data, indent=2)
    if output_format == "srt":
        return SRTFormatter().format_transcript(transcript_data)
    if output_format == "vtt":
        return WebVTTFormatter().format_transcript(transcript_data)

    # default: plain text
    if preserve_timestamps:
        return "\n".join(
            f"{_format_timestamp(_snippet_field(entry, 'start'))} {_snippet_field(entry, 'text')}"
            for entry in transcript_data
        )
    return TextFormatter().format_transcript(transcript_data)


def fetch_transcript(
    video_id: str,
    languages: Optional[list] = None,
    output_format: str = "text",
    preserve_timestamps: bool = False,
    output_file: Optional[str] = None,
) -> str:
    """
    Fetch transcript for a given video ID.

    On any failure an error message is printed to stderr and the process exits
    with status 1 (``SystemExit``); callers that want to continue must catch it.

    Args:
        video_id:             YouTube video ID.
        languages:            Ordered list of language codes to try (e.g. ['en', 'es']).
                              Defaults to ['en'] if None. If none of the requested
                              languages exists, the first available transcript is used
                              and a warning is printed to stderr.
        output_format:        One of 'text', 'json', 'srt', 'vtt'. Any other value is
                              treated as 'text'.
        preserve_timestamps:  Include timestamps in plain-text output.
        output_file:          If provided, write transcript to this file path.

    Returns:
        The transcript as a formatted string.
    """
    if languages is None:
        languages = ["en"]

    try:
        api = YouTubeTranscriptApi()
        transcript_list = api.list(video_id)

        # Try requested languages first; fall back to any available transcript
        transcript = _select_transcript(transcript_list, languages, video_id)
        transcript_data = transcript.fetch()

        result = _render_transcript(transcript_data, output_format, preserve_timestamps)

        if output_file:
            with open(output_file, "w", encoding="utf-8") as f:
                f.write(result)
            print(f"Transcript saved to: {output_file}")

        return result

    except TranscriptsDisabled:
        print(f"Error: Transcripts are disabled for video '{video_id}'.", file=sys.stderr)
        sys.exit(1)
    except VideoUnavailable:
        print(f"Error: Video '{video_id}' is unavailable or does not exist.", file=sys.stderr)
        sys.exit(1)
    except CouldNotRetrieveTranscript as e:
        print(f"Error for video '{video_id}': {e}", file=sys.stderr)
        sys.exit(1)
    except Exception as e:
        # Top-level boundary for this operation (also covers file-write errors).
        # SystemExit is not an Exception subclass, so the exits above pass through.
        print(f"Unexpected error: {e}", file=sys.stderr)
        sys.exit(1)


def fetch_multiple(
    video_ids: list,
    languages: Optional[list] = None,
    output_format: str = "text",
    preserve_timestamps: bool = False,
    output_dir: Optional[str] = None,
) -> dict:
    """
    Fetch transcripts for multiple video IDs.

    A failure for one video does not stop the others from being fetched.

    Args:
        video_ids:            List of YouTube video IDs.
        languages:            Language preference list.
        output_format:        Output format string.
        preserve_timestamps:  Include timestamps.
        output_dir:           Directory to save individual transcript files
                              (created if missing). Files are named
                              ``<video_id>.<ext>``.

    Returns:
        Dictionary mapping video_id -> ``{"status": "ok", "transcript": str}`` on
        success, or ``{"status": "error", "transcript": None}`` on failure.
    """
    ext = _EXTENSIONS.get(output_format, "txt")
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    results = {}
    for vid in video_ids:
        print(f"Fetching: {vid}", file=sys.stderr)
        out_file = os.path.join(output_dir, f"{vid}.{ext}") if output_dir else None
        try:
            transcript = fetch_transcript(
                video_id=vid,
                languages=languages,
                output_format=output_format,
                preserve_timestamps=preserve_timestamps,
                output_file=out_file,
            )
        except SystemExit:
            # fetch_transcript reports its own error and exits; record the
            # failure here and carry on with the remaining videos.
            results[vid] = {"status": "error", "transcript": None}
        else:
            results[vid] = {"status": "ok", "transcript": transcript}

    return results


def parse_args():
    """Build the command-line parser and return the parsed arguments."""
    parser = argparse.ArgumentParser(
        description="Fetch YouTube video transcripts directly — no scraping required.",
        formatter_class=argparse.RawTextHelpFormatter,
    )

    parser.add_argument(
        "video",
        nargs="+",
        help="YouTube video URL(s) or video ID(s).",
    )
    parser.add_argument(
        "-l", "--languages",
        nargs="+",
        default=["en"],
        metavar="LANG",
        help="Language codes in order of preference (default: en).\nExample: --languages en es fr",
    )
    parser.add_argument(
        "-f", "--format",
        choices=["text", "json", "srt", "vtt"],
        default="text",
        help="Output format (default: text).",
    )
    parser.add_argument(
        "-t", "--timestamps",
        action="store_true",
        help="Include timestamps in plain-text output.",
    )
    parser.add_argument(
        "-o", "--output",
        metavar="PATH",
        help="Output file path (single video) or directory (multiple videos).",
    )
    parser.add_argument(
        "--list",
        action="store_true",
        help="List all available transcript languages for the video(s) and exit.",
    )

    return parser.parse_args()


def main():
    """Command-line entry point: list or fetch transcripts for the given videos."""
    args = parse_args()

    # Report an unrecognised URL/ID as a clean error instead of a traceback.
    try:
        video_ids = [extract_video_id(v) for v in args.video]
    except ValueError as exc:
        print(f"Error: {exc}", file=sys.stderr)
        sys.exit(1)

    if args.list:
        failed = False
        for vid in video_ids:
            try:
                list_available_transcripts(vid)
            except CouldNotRetrieveTranscript as exc:
                # Same reporting style as fetch_transcript; keep listing the rest.
                print(f"Error for video '{vid}': {exc}", file=sys.stderr)
                failed = True
        if failed:
            sys.exit(1)
        return

    if len(video_ids) == 1:
        transcript = fetch_transcript(
            video_id=video_ids[0],
            languages=args.languages,
            output_format=args.format,
            preserve_timestamps=args.timestamps,
            output_file=args.output,
        )
        if not args.output:
            print(transcript)
    else:
        results = fetch_multiple(
            video_ids=video_ids,
            languages=args.languages,
            output_format=args.format,
            preserve_timestamps=args.timestamps,
            output_dir=args.output,
        )
        if not args.output:
            for vid, data in results.items():
                print(f"\n{'='*60}")
                print(f"Video ID: {vid}")
                print(f"{'='*60}")
                if data["status"] == "ok":
                    print(data["transcript"])
                else:
                    print("Failed to retrieve transcript.")


if __name__ == "__main__":
    main()