#!/usr/bin/env python3
"""
YouTube Transcript Fetcher

Fetches transcripts directly from YouTube videos using the YouTube Transcript API.
No HTML parsing or scraping involved.
"""

import argparse
import json
import sys
import re
from typing import Optional

from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api.formatters import (
    TextFormatter,
    JSONFormatter,
    SRTFormatter,
    WebVTTFormatter,
)
from youtube_transcript_api._errors import (
    TranscriptsDisabled,
    NoTranscriptFound,
    VideoUnavailable,
    CouldNotRetrieveTranscript,
)


def extract_video_id(url_or_id: str) -> str:
    """
    Extract the video ID from a YouTube URL or return it directly if already an ID.

    Supports formats:
        - https://www.youtube.com/watch?v=VIDEO_ID (``v`` may appear anywhere
          in the query string)
        - https://youtu.be/VIDEO_ID
        - https://www.youtube.com/shorts/VIDEO_ID
        - https://www.youtube.com/embed/VIDEO_ID
          (also https://www.youtube-nocookie.com/embed/VIDEO_ID)
        - https://www.youtube.com/live/VIDEO_ID
        - VIDEO_ID (raw)

    Args:
        url_or_id: A YouTube URL or a raw 11-character video ID. Leading and
                   trailing whitespace is ignored.

    Returns:
        The 11-character video ID.

    Raises:
        ValueError: If no video ID can be extracted from the input.
    """
    candidate = url_or_id.strip()

    patterns = (
        # ``v`` must be a whole query parameter (right after ``?`` or ``&``),
        # so a parameter that merely ends in "v" (e.g. ``rev=``) is never
        # mistaken for it. The lazy prefix picks the first ``v=`` if several
        # are present; ``&amp;`` covers URLs copied out of HTML.
        r"youtube\.com/watch\?(?:[^#]*?&(?:amp;)?)??v=([a-zA-Z0-9_-]{11})",
        r"youtu\.be/([a-zA-Z0-9_-]{11})",
        r"youtube\.com/shorts/([a-zA-Z0-9_-]{11})",
        r"youtube(?:-nocookie)?\.com/embed/([a-zA-Z0-9_-]{11})",
        r"youtube\.com/live/([a-zA-Z0-9_-]{11})",
    )
    for pattern in patterns:
        match = re.search(pattern, candidate)
        if match:
            return match.group(1)

    # Assume raw video ID if it looks like one
    if re.fullmatch(r"[a-zA-Z0-9_-]{11}", candidate):
        return candidate

    raise ValueError(
        f"Could not extract a valid YouTube video ID from: {url_or_id}\n"
        "Accepted formats: full URL, youtu.be short link, or raw 11-character video ID."
    )


def list_available_transcripts(video_id: str) -> None:
    """
    Print every transcript language available for a video to stdout.

    Manually created transcripts are listed first, then auto-generated ones.
    Errors raised by the transcript API while listing are not handled here
    and propagate to the caller.

    Args:
        video_id: YouTube video ID.
    """
    transcript_list = YouTubeTranscriptApi().list(video_id)

    print(f"\nAvailable transcripts for video: {video_id}\n")

    # Classify through the public interface (iteration + ``is_generated``)
    # rather than reaching into the list's private dictionaries.
    manually_created = []
    auto_generated = []
    for transcript in transcript_list:
        if transcript.is_generated:
            auto_generated.append(transcript)
        else:
            manually_created.append(transcript)

    if manually_created:
        print("Manually created:")
        for t in manually_created:
            print(f"  [{t.language_code}] {t.language}")

    if auto_generated:
        print("Auto-generated:")
        for t in auto_generated:
            print(f"  [{t.language_code}] {t.language} (auto)")

    if not manually_created and not auto_generated:
        print("  No transcripts found.")


def _select_transcript(transcript_list, languages, video_id):
    """Return the preferred-language transcript, else the first one available."""
    try:
        return transcript_list.find_transcript(languages)
    except NoTranscriptFound:
        pass

    # Grab whatever is available
    fallback = next(iter(transcript_list), None)
    if fallback is None:
        print(f"Error: No transcript is available for video '{video_id}'.", file=sys.stderr)
        sys.exit(1)

    print(
        f"Warning: None of the requested languages found. "
        f"Using [{fallback.language_code}] {fallback.language} instead.",
        file=sys.stderr,
    )
    return fallback


def _timestamped_text(transcript_data) -> str:
    """Render transcript entries as ``[MM:SS.cc] text`` lines."""
    # Fetched transcripts from the instance-based API hold snippet objects
    # that cannot be indexed like dicts; ``to_raw_data()`` converts them to
    # plain dicts. Fall back to the data itself when it is already dicts.
    to_raw_data = getattr(transcript_data, "to_raw_data", None)
    entries = to_raw_data() if callable(to_raw_data) else transcript_data

    lines = []
    for entry in entries:
        # Round to whole centiseconds BEFORE splitting, so 59.996 s is shown
        # as 01:00.00 instead of the impossible 00:60.00.
        minutes, centis = divmod(round(entry["start"] * 100), 6000)
        seconds, hundredths = divmod(centis, 100)
        lines.append(f"[{minutes:02d}:{seconds:02d}.{hundredths:02d}] {entry['text']}")
    return "\n".join(lines)


def _render_transcript(transcript_data, output_format: str, preserve_timestamps: bool) -> str:
    """Format fetched transcript data as 'json', 'srt', 'vtt' or plain text."""
    if output_format == "json":
        return JSONFormatter().format_transcript(transcript_data, indent=2)
    if output_format == "srt":
        return SRTFormatter().format_transcript(transcript_data)
    if output_format == "vtt":
        return WebVTTFormatter().format_transcript(transcript_data)
    # default: plain text
    if preserve_timestamps:
        return _timestamped_text(transcript_data)
    return TextFormatter().format_transcript(transcript_data)


def fetch_transcript(
    video_id: str,
    languages: Optional[list] = None,
    output_format: str = "text",
    preserve_timestamps: bool = False,
    output_file: Optional[str] = None,
) -> str:
    """
    Fetch transcript for a given video ID.

    Any failure is reported on stderr and terminates via ``sys.exit(1)``
    (i.e. raises ``SystemExit``); callers that want to continue after a
    failure must catch ``SystemExit``.

    Args:
        video_id:            YouTube video ID.
        languages:           Ordered list of language codes to try (e.g. ['en', 'es']).
                             Defaults to ['en'] when None. If none of them is
                             available, the first transcript the video offers
                             is used and a warning is printed to stderr.
        output_format:       One of 'text', 'json', 'srt', 'vtt'. Any other
                             value is treated as 'text'.
        preserve_timestamps: Include timestamps in plain-text output
                             (ignored for the other formats).
        output_file:         If provided, write transcript to this file path
                             (UTF-8) and print a confirmation to stdout.

    Returns:
        The transcript as a formatted string.
    """
    if languages is None:
        languages = ["en"]

    try:
        transcript_list = YouTubeTranscriptApi().list(video_id)
        transcript = _select_transcript(transcript_list, languages, video_id)
        result = _render_transcript(transcript.fetch(), output_format, preserve_timestamps)

        if output_file:
            with open(output_file, "w", encoding="utf-8") as f:
                f.write(result)
            print(f"Transcript saved to: {output_file}")

        return result

    except TranscriptsDisabled:
        print(f"Error: Transcripts are disabled for video '{video_id}'.", file=sys.stderr)
        sys.exit(1)
    except VideoUnavailable:
        print(f"Error: Video '{video_id}' is unavailable or does not exist.", file=sys.stderr)
        sys.exit(1)
    except CouldNotRetrieveTranscript as e:
        print(f"Error for video '{video_id}': {e}", file=sys.stderr)
        sys.exit(1)
    except Exception as e:
        # Last-resort boundary (e.g. file-write errors). The SystemExit raised
        # above is not an Exception subclass, so it passes through untouched.
        print(f"Unexpected error: {e}", file=sys.stderr)
        sys.exit(1)


def fetch_multiple(
    video_ids: list,
    languages: Optional[list] = None,
    output_format: str = "text",
    preserve_timestamps: bool = False,
    output_dir: Optional[str] = None,
) -> dict:
    """
    Fetch transcripts for multiple video IDs.

    A failure for one video does not stop the others: the ``SystemExit``
    raised by ``fetch_transcript`` is caught and recorded as an error entry.

    Args:
        video_ids:           List of YouTube video IDs.
        languages:           Language preference list.
        output_format:       Output format string ('text', 'json', 'srt', 'vtt').
        preserve_timestamps: Include timestamps.
        output_dir:          Directory to save individual transcript files,
                             one ``<video_id>.<ext>`` file per video. Created
                             if it does not exist.

    Returns:
        Dictionary mapping video_id -> ``{"status": "ok", "transcript": str}``
        on success or ``{"status": "error", "transcript": None}`` on failure.
    """
    import os

    # Both the file extension and the target directory are the same for every
    # video, so resolve them once instead of on each iteration.
    ext_map = {"text": "txt", "json": "json", "srt": "srt", "vtt": "vtt"}
    ext = ext_map.get(output_format, "txt")
    if output_dir and video_ids:
        os.makedirs(output_dir, exist_ok=True)

    results = {}
    for vid in video_ids:
        print(f"Fetching: {vid}", file=sys.stderr)
        out_file = os.path.join(output_dir, f"{vid}.{ext}") if output_dir else None
        try:
            transcript = fetch_transcript(
                video_id=vid,
                languages=languages,
                output_format=output_format,
                preserve_timestamps=preserve_timestamps,
                output_file=out_file,
            )
        except SystemExit:
            results[vid] = {"status": "error", "transcript": None}
        else:
            results[vid] = {"status": "ok", "transcript": transcript}

    return results


def parse_args():
    """
    Define the command-line interface and parse the process arguments.

    Returns:
        The ``argparse.Namespace`` with attributes ``video``, ``languages``,
        ``format``, ``timestamps``, ``output`` and ``list``.
    """
    # Each entry is (flag names, add_argument keyword options); they are
    # registered in this order, which is also the order shown in --help.
    argument_specs = (
        (
            ("video",),
            {"nargs": "+", "help": "YouTube video URL(s) or video ID(s)."},
        ),
        (
            ("-l", "--languages"),
            {
                "nargs": "+",
                "default": ["en"],
                "metavar": "LANG",
                "help": "Language codes in order of preference (default: en).\nExample: --languages en es fr",
            },
        ),
        (
            ("-f", "--format"),
            {
                "choices": ["text", "json", "srt", "vtt"],
                "default": "text",
                "help": "Output format (default: text).",
            },
        ),
        (
            ("-t", "--timestamps"),
            {"action": "store_true", "help": "Include timestamps in plain-text output."},
        ),
        (
            ("-o", "--output"),
            {
                "metavar": "PATH",
                "help": "Output file path (single video) or directory (multiple videos).",
            },
        ),
        (
            ("--list",),
            {
                "action": "store_true",
                "help": "List all available transcript languages for the video(s) and exit.",
            },
        ),
    )

    cli = argparse.ArgumentParser(
        description="Fetch YouTube video transcripts directly — no scraping required.",
        formatter_class=argparse.RawTextHelpFormatter,
    )
    for names, options in argument_specs:
        cli.add_argument(*names, **options)

    return cli.parse_args()


def main():
    """
    Command-line entry point.

    Parses the arguments, resolves every positional argument to a video ID,
    then either lists the available transcripts (``--list``) or fetches them.
    A single video is printed to stdout or written to the ``--output`` file;
    multiple videos are printed under per-video banners or written into the
    ``--output`` directory. Exits with status 1 if an input cannot be parsed
    as a video or if any video fails.
    """
    args = parse_args()

    # A malformed URL/ID is a user error: report it cleanly, no traceback.
    try:
        video_ids = [extract_video_id(v) for v in args.video]
    except ValueError as err:
        print(f"Error: {err}", file=sys.stderr)
        sys.exit(1)

    if args.list:
        any_failed = False
        for vid in video_ids:
            try:
                list_available_transcripts(vid)
            except CouldNotRetrieveTranscript as err:
                # Keep listing the remaining videos; fail at the end.
                print(f"Error for video '{vid}': {err}", file=sys.stderr)
                any_failed = True
        if any_failed:
            sys.exit(1)
        return

    if len(video_ids) == 1:
        transcript = fetch_transcript(
            video_id=video_ids[0],
            languages=args.languages,
            output_format=args.format,
            preserve_timestamps=args.timestamps,
            output_file=args.output,
        )
        if not args.output:
            print(transcript)
    else:
        results = fetch_multiple(
            video_ids=video_ids,
            languages=args.languages,
            output_format=args.format,
            preserve_timestamps=args.timestamps,
            output_dir=args.output,
        )
        if not args.output:
            for vid, data in results.items():
                print(f"\n{'='*60}")
                print(f"Video ID: {vid}")
                print(f"{'='*60}")
                if data["status"] == "ok":
                    print(data["transcript"])
                else:
                    print("Failed to retrieve transcript.")

        # Match single-video mode, which exits non-zero on failure.
        if any(data["status"] != "ok" for data in results.values()):
            sys.exit(1)

# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()