| """ |
| YouTube Transcript Tool - Extract transcripts from YouTube videos |
| Author: @mangobee |
| Date: 2026-01-13 |
| |
| Provides YouTube video transcript extraction: |
| - Primary: youtube-transcript-api (instant, 1-3 seconds) |
| - Fallback: yt-dlp audio extraction + Whisper transcription (30s-2min) |
| - Handles various YouTube URL formats (watch, youtu.be, shorts) |
| - Returns clean transcript text for LLM analysis |
| |
| Workflow: |
| YouTube URL |
| ├─ Has transcript? ✅ → Use youtube-transcript-api (instant) |
| └─ No transcript? ❌ → Download audio + Whisper (slower, but works) |
| |
| Requirements: |
| - youtube-transcript-api: pip install youtube-transcript-api |
| - yt-dlp: pip install yt-dlp |
| - openai-whisper: pip install openai-whisper (via src.tools.audio) |
| """ |
|
|
| import logging |
| import os |
| import re |
| import tempfile |
| from typing import Dict, Any, Optional |
| from pathlib import Path |
|
|
| |
| |
| |
| |
# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------

# Regular expressions that capture the 11-character video ID in group 1.
# They are tried in order; the first one that matches wins.
YOUTUBE_PATTERNS = [
    # watch?v=ID (v as first query parameter), youtu.be/ID and shorts/ID.
    r'(?:youtube\.com\/watch\?v=|youtu\.be\/|youtube\.com\/shorts\/)([a-zA-Z0-9_-]{11})',
    # watch URLs where `v` is NOT the first query parameter, e.g.
    # watch?app=desktop&v=ID or watch?list=PL...&v=ID.
    r'youtube\.com\/watch\?(?:[^#\s]*?&)?v=([a-zA-Z0-9_-]{11})',
]

# Codec and bitrate (kbps, as a string) requested from the audio post-processor.
AUDIO_FORMAT = "mp3"
AUDIO_QUALITY = "128"

# When True, the downloaded audio file is deleted after transcription.
CLEANUP_TEMP_FILES = True

# ---------------------------------------------------------------------------
# Logging
# ---------------------------------------------------------------------------

logger = logging.getLogger(__name__)
|
|
|
|
| |
| |
| |
|
|
def save_transcript_to_cache(video_id: str, text: str, source: str) -> None:
    """
    Save a transcript to the ``_log/`` folder for debugging.

    The file is written to ``_log/<video_id>_transcript.txt`` (relative to the
    current working directory) and starts with a small ``#``-prefixed header
    followed by a blank line and the transcript text.

    This is best-effort: any failure is logged as a warning and swallowed so
    that a caching problem never breaks transcript extraction.

    Args:
        video_id: YouTube video ID
        text: Transcript text
        source: "api" or "whisper"
    """
    from datetime import datetime

    try:
        log_dir = Path("_log")
        log_dir.mkdir(exist_ok=True)

        cache_file = log_dir / f"{video_id}_transcript.txt"

        # Build the header before opening the file so that a formatting error
        # (e.g. text is None) does not leave a truncated cache file behind.
        header = (
            "# YouTube Transcript\n"
            f"# Video ID: {video_id}\n"
            f"# Source: {source}\n"
            f"# Length: {len(text)} characters\n"
            f"# Generated: {datetime.now().isoformat()}\n"
        )

        with open(cache_file, "w", encoding="utf-8") as f:
            f.write(f"{header}\n{text}\n")

        logger.info(f"Transcript saved: {cache_file}")
    except Exception as e:
        logger.warning(f"Failed to save transcript: {e}")
|
|
|
|
| |
| |
| |
|
|
def extract_video_id(url: str) -> Optional[str]:
    """
    Pull the video ID out of a YouTube URL.

    Every regex in YOUTUBE_PATTERNS is searched against the URL in order and
    the first capture group of the first match is returned. Typical inputs:
        - youtube.com/watch?v=VIDEO_ID
        - youtu.be/VIDEO_ID
        - youtube.com/shorts/VIDEO_ID

    Args:
        url: YouTube URL

    Returns:
        The captured video ID (11 characters), or None when the URL is
        empty/None or no pattern matches.

    Examples:
        >>> extract_video_id("https://youtube.com/watch?v=dQw4w9WgXcQ")
        'dQw4w9WgXcQ'

        >>> extract_video_id("https://youtu.be/dQw4w9WgXcQ")
        'dQw4w9WgXcQ'
    """
    if not url:
        return None

    # Lazily evaluate the patterns so searching stops at the first hit.
    candidates = (re.search(regex, url) for regex in YOUTUBE_PATTERNS)
    first_hit = next((hit for hit in candidates if hit), None)

    return first_hit.group(1) if first_hit else None
|
|
|
|
| |
| |
| |
|
|
def get_youtube_transcript(video_id: str) -> Dict[str, Any]:
    """
    Get transcript using youtube-transcript-api.

    Requests an English transcript ('en', 'en-US', 'en-GB'), joins the
    non-empty snippet texts with single spaces and caches the result via
    save_transcript_to_cache. Never raises: every failure (including the
    package not being installed) is reported through the returned dict.

    Args:
        video_id: YouTube video ID (11 characters)

    Returns:
        Dict with structure: {
            "text": str,          # Transcript text ("" on failure)
            "video_id": str,      # Video ID
            "source": str,        # Always "api" for this function
            "success": bool,      # True if the transcript was fetched
            "error": str or None  # Error message if failed
        }
    """
    try:
        from youtube_transcript_api import YouTubeTranscriptApi

        logger.info(f"Fetching transcript for video: {video_id}")

        api = YouTubeTranscriptApi()
        transcript_list = api.fetch(
            video_id,
            languages=['en', 'en-US', 'en-GB']
        )

        text_parts = []
        for entry in transcript_list:
            # fetch() yields snippet objects exposing a `.text` attribute;
            # calling `.get()` on them raises AttributeError. Plain dicts
            # (the shape returned by older library releases) are still
            # accepted so either version works.
            if isinstance(entry, dict):
                raw_text = entry.get('text', '')
            else:
                raw_text = getattr(entry, 'text', '')

            cleaned = (raw_text or '').strip()
            if cleaned:
                text_parts.append(cleaned)

        text = ' '.join(text_parts)

        logger.info(f"Transcript fetched: {len(text)} characters")

        # Keep a copy on disk for debugging (best-effort, never raises).
        save_transcript_to_cache(video_id, text, "api")

        return {
            "text": text,
            "video_id": video_id,
            "source": "api",
            "success": True,
            "error": None
        }

    except Exception as e:
        error_msg = str(e)
        logger.error(f"YouTube transcript API failed: {error_msg}")

        # Map the library's "no captions" messages to a friendlier error;
        # anything else is passed through with a generic prefix.
        if "No transcript found" in error_msg or "Could not retrieve a transcript" in error_msg:
            error = "No transcript available (video may not have captions)"
        else:
            error = f"Transcript API error: {error_msg}"

        return {
            "text": "",
            "video_id": video_id,
            "source": "api",
            "success": False,
            "error": error
        }
|
|
|
|
| |
| |
| |
|
|
def download_audio(video_url: str) -> Optional[str]:
    """
    Download audio from YouTube using yt-dlp.

    The best available audio stream is downloaded into the system temp
    directory and converted to AUDIO_FORMAT at AUDIO_QUALITY by yt-dlp's
    FFmpegExtractAudio post-processor. Each call uses its own unique file
    name, so repeated or concurrent calls in one process cannot overwrite or
    pick up each other's files.

    Never raises: failures are logged and reported as None. The caller owns
    the returned file and is responsible for deleting it.

    Args:
        video_url: Full YouTube URL

    Returns:
        Path to downloaded audio file or None if failed
    """
    try:
        import uuid

        import yt_dlp

        logger.info(f"Downloading audio from: {video_url}")

        temp_dir = tempfile.gettempdir()
        # PID alone is not unique within a process (and PIDs get reused), so
        # add a random component to the file stem.
        stem = f"youtube_audio_{os.getpid()}_{uuid.uuid4().hex}"
        output_path = os.path.join(temp_dir, f"{stem}.{AUDIO_FORMAT}")

        ydl_opts = {
            'format': 'bestaudio/best',
            'postprocessors': [{
                'key': 'FFmpegExtractAudio',
                'preferredcodec': AUDIO_FORMAT,
                'preferredquality': AUDIO_QUALITY,
            }],
            # Explicit %(ext)s so the download is "<stem>.<source ext>" and the
            # post-processor output is "<stem>.<AUDIO_FORMAT>". A literal '%'
            # in the temp dir must be doubled inside an output template.
            'outtmpl': os.path.join(temp_dir.replace('%', '%%'), f"{stem}.%(ext)s"),
            'quiet': True,
            'no_warnings': True,
        }

        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            ydl.download([video_url])

        if os.path.exists(output_path):
            logger.info(f"Audio downloaded: {output_path} ({os.path.getsize(output_path)} bytes)")
            return output_path

        # Fallback: the file may have ended up with a different extension.
        # Match on "<stem>." so only this call's output can be selected.
        for name in os.listdir(temp_dir):
            if name.startswith(f"{stem}."):
                actual_path = os.path.join(temp_dir, name)
                logger.info(f"Audio downloaded: {actual_path}")
                return actual_path

        logger.error("Audio file not found after download")
        return None

    except ImportError:
        logger.error("yt-dlp not installed. Run: pip install yt-dlp")
        return None
    except Exception as e:
        logger.error(f"Audio download failed: {e}")
        return None
|
|
|
|
def transcribe_from_audio(video_url: str) -> Dict[str, Any]:
    """
    Fallback: Download audio and transcribe with Whisper.

    Downloads the audio with download_audio, passes the file to
    src.tools.audio.transcribe_audio and, on success, caches the transcript
    via save_transcript_to_cache. When CLEANUP_TEMP_FILES is set, the
    downloaded file is removed whether transcription succeeds, reports
    failure, or raises. Never raises: failures are reported in the result.

    Args:
        video_url: Full YouTube URL

    Returns:
        Dict with structure: {
            "text": str,          # Transcript text ("" on failure)
            "video_id": str,      # Video ID ("" if the URL is invalid)
            "source": str,        # "whisper"
            "success": bool,      # True if transcription succeeded
            "error": str or None  # Error message if failed
        }
    """
    video_id = extract_video_id(video_url)

    if not video_id:
        return {
            "text": "",
            "video_id": "",
            "source": "whisper",
            "success": False,
            "error": "Invalid YouTube URL"
        }

    audio_file = download_audio(video_url)

    if not audio_file:
        return {
            "text": "",
            "video_id": video_id,
            "source": "whisper",
            "success": False,
            "error": "Failed to download audio"
        }

    try:
        # Imported lazily so the module loads without the Whisper stack.
        from src.tools.audio import transcribe_audio

        result = transcribe_audio(audio_file)

        if result["success"]:
            save_transcript_to_cache(video_id, result["text"], "whisper")

            return {
                "text": result["text"],
                "video_id": video_id,
                "source": "whisper",
                "success": True,
                "error": None
            }

        return {
            "text": "",
            "video_id": video_id,
            "source": "whisper",
            "success": False,
            "error": result.get("error", "Transcription failed")
        }

    except Exception as e:
        logger.error(f"Whisper transcription failed: {e}")
        return {
            "text": "",
            "video_id": video_id,
            "source": "whisper",
            "success": False,
            "error": f"Whisper transcription failed: {str(e)}"
        }

    finally:
        # Runs on every exit path, so the temp file is not leaked when the
        # import or the transcription itself raises.
        if CLEANUP_TEMP_FILES:
            try:
                os.remove(audio_file)
                logger.info(f"Cleaned up temp file: {audio_file}")
            except OSError as e:
                logger.warning(f"Failed to cleanup temp file: {e}")
|
|
|
|
| |
| |
| |
|
|
def youtube_transcript(url: str) -> Dict[str, Any]:
    """
    Extract transcript from YouTube video.

    Primary method: youtube-transcript-api (instant)
    Fallback method: Download audio + Whisper transcription (slower)

    This function does not raise for bad input: an unrecognised URL is
    reported through the returned dict ("success" False, "source" "none").

    Args:
        url: YouTube video URL (youtube.com, youtu.be, shorts)

    Returns:
        Dict with structure: {
            "text": str,          # Transcript text ("" on failure)
            "video_id": str,      # Video ID ("" if the URL is invalid)
            "source": str,        # "api", "whisper", or "none" for an invalid URL
            "success": bool,      # True if transcription succeeded
            "error": str or None  # Error message if failed
        }

        When both methods fail, the returned dict is the one produced by the
        Whisper fallback.

    Examples:
        >>> youtube_transcript("https://youtube.com/watch?v=dQw4w9WgXcQ")
        {"text": "Never gonna give you up...", "video_id": "dQw4w9WgXcQ", "source": "api", "success": True, "error": None}
    """
    video_id = extract_video_id(url)

    if not video_id:
        logger.error(f"Invalid YouTube URL: {url}")
        return {
            "text": "",
            "video_id": "",
            "source": "none",
            "success": False,
            "error": f"Invalid YouTube URL: {url}"
        }

    logger.info(f"Processing YouTube video: {video_id}")

    # Fast path: published captions.
    result = get_youtube_transcript(video_id)

    if result["success"]:
        logger.info(f"Transcript retrieved via API: {len(result['text'])} characters")
        # Only a preview is logged on this path.
        logger.info(f"Transcript content: {result['text'][:200]}...")
        return result

    # Slow path: download the audio and transcribe it.
    logger.info("Transcript API failed, trying audio transcription...")
    result = transcribe_from_audio(url)

    if result["success"]:
        logger.info(f"Transcript retrieved via Whisper: {len(result['text'])} characters")
        # The whole transcript is logged on this path.
        logger.info(f"Full transcript: {result['text']}")
    else:
        logger.error(f"All transcript methods failed for video: {video_id}")

    return result
|
|