agentbee / src /tools /youtube.py
mangubee's picture
refactor: rename runtime folders with underscore prefix
04ece4c
raw
history blame
13 kB
"""
YouTube Transcript Tool - Extract transcripts from YouTube videos
Author: @mangobee
Date: 2026-01-13
Provides YouTube video transcript extraction:
- Primary: youtube-transcript-api (instant, 1-3 seconds)
- Fallback: yt-dlp audio extraction + Whisper transcription (30s-2min)
- Handles various YouTube URL formats (watch, youtu.be, shorts)
- Returns clean transcript text for LLM analysis
Workflow:
YouTube URL
├─ Has transcript? ✅ → Use youtube-transcript-api (instant)
└─ No transcript? ❌ → Download audio + Whisper (slower, but works)
Requirements:
- youtube-transcript-api: pip install youtube-transcript-api
- yt-dlp: pip install yt-dlp
- openai-whisper: pip install openai-whisper (via src.tools.audio)
"""
import logging
import os
import re
import tempfile
from typing import Dict, Any, Optional
from pathlib import Path
# ============================================================================
# CONFIG
# ============================================================================
# YouTube URL patterns, tried in order. Each pattern has exactly one capturing
# group holding the 11-character video ID.
YOUTUBE_PATTERNS = [
    # watch?v=ID, youtu.be/ID and shorts/ID
    r'(?:youtube\.com\/watch\?v=|youtu\.be\/|youtube\.com\/shorts\/)([a-zA-Z0-9_-]{11})',
    # watch URLs where v= is not the first query parameter,
    # e.g. watch?feature=share&v=ID or watch?app=desktop&v=ID
    r'youtube\.com\/watch\?[^#\s]*?&v=([a-zA-Z0-9_-]{11})',
    # embed/ID and live/ID
    r'youtube\.com\/(?:embed|live)\/([a-zA-Z0-9_-]{11})',
]
# Audio download settings
# Target codec handed to yt-dlp's FFmpegExtractAudio post-processor; also the
# file extension download_audio() expects on the finished file.
AUDIO_FORMAT = "mp3"
# Passed to the post-processor as 'preferredquality'.
AUDIO_QUALITY = "128" # 128 kbps (sufficient for speech)
# Temporary file cleanup
# When True, transcribe_from_audio() deletes the downloaded audio file once
# transcription has run.
CLEANUP_TEMP_FILES = True
# ============================================================================
# Logging Setup
# ============================================================================
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
# ============================================================================
# Transcript Cache
# ============================================================================
def save_transcript_to_cache(video_id: str, text: str, source: str) -> None:
    """
    Save a transcript to the _log/ folder for debugging.

    The file is written to ``_log/<video_id>_transcript.txt`` (relative to the
    current working directory) with a short ``#``-prefixed header followed by
    the transcript text. This is best-effort: any failure is logged as a
    warning and never raised to the caller.

    Args:
        video_id: YouTube video ID (used to build the file name)
        text: Transcript text
        source: "api" or "whisper"
    """
    # Local import: only this helper needs datetime.
    from datetime import datetime

    try:
        log_dir = Path("_log")
        log_dir.mkdir(exist_ok=True)
        cache_file = log_dir / f"{video_id}_transcript.txt"

        header = (
            "# YouTube Transcript\n"
            f"# Video ID: {video_id}\n"
            f"# Source: {source}\n"
            f"# Length: {len(text)} characters\n"
            f"# Generated: {datetime.now().isoformat()}\n"
        )
        with open(cache_file, "w", encoding="utf-8") as f:
            # Blank line separates the header from the transcript body.
            f.write(f"{header}\n{text}\n")

        logger.info(f"Transcript saved: {cache_file}")
    except Exception as e:
        # Caching is a debugging aid only; never let it break extraction.
        logger.warning(f"Failed to save transcript: {e}")
# ============================================================================
# YouTube URL Parser
# =============================================================================
def extract_video_id(url: str) -> Optional[str]:
    """
    Pull the video ID out of a YouTube URL.

    The URL is searched with each regex in YOUTUBE_PATTERNS in order; the
    first match wins and its first capture group is returned. Typical inputs:
    - youtube.com/watch?v=VIDEO_ID
    - youtu.be/VIDEO_ID
    - youtube.com/shorts/VIDEO_ID

    Args:
        url: YouTube URL

    Returns:
        Video ID (11 characters), or None when the URL is empty or matches
        none of the patterns

    Examples:
        >>> extract_video_id("https://youtube.com/watch?v=dQw4w9WgXcQ")
        'dQw4w9WgXcQ'
        >>> extract_video_id("https://youtu.be/dQw4w9WgXcQ")
        'dQw4w9WgXcQ'
    """
    if url:
        # Lazily try each pattern; stop at the first one that matches.
        attempts = (re.search(candidate, url) for candidate in YOUTUBE_PATTERNS)
        hit = next((found for found in attempts if found), None)
        if hit is not None:
            return hit.group(1)
    return None
# ============================================================================
# Transcript Extraction (Primary Method)
# =============================================================================
def get_youtube_transcript(video_id: str) -> Dict[str, Any]:
    """
    Get transcript using youtube-transcript-api.

    Never raises: every failure (missing package, no captions, network or
    API errors) is reported through the returned dict.

    Args:
        video_id: YouTube video ID (11 characters)

    Returns:
        Dict with structure: {
            "text": str,          # Transcript text ("" on failure)
            "video_id": str,      # Video ID
            "source": str,        # Always "api" for this function
            "success": bool,      # True if the transcript was fetched
            "error": str or None  # Error message if failed
        }
    """
    try:
        from youtube_transcript_api import YouTubeTranscriptApi

        logger.info(f"Fetching transcript for video: {video_id}")

        # Request English captions; the list is a priority order.
        # Note: fetch() is an instance method in newer versions
        api = YouTubeTranscriptApi()
        transcript_list = api.fetch(
            video_id,
            languages=['en', 'en-US', 'en-GB']
        )

        # Clean transcript: drop timestamps, combine segments.
        # fetch() yields snippet objects that expose `.text` (they are not
        # dicts, so `.get()` would raise AttributeError); plain dict entries
        # are still accepted for raw-data style results.
        text_parts = []
        for entry in transcript_list:
            if isinstance(entry, dict):
                raw = entry.get('text', '')
            else:
                raw = getattr(entry, 'text', '')
            piece = (raw or '').strip()
            if piece:
                text_parts.append(piece)
        text = ' '.join(text_parts)

        logger.info(f"Transcript fetched: {len(text)} characters")

        # Save to cache for debugging
        save_transcript_to_cache(video_id, text, "api")

        return {
            "text": text,
            "video_id": video_id,
            "source": "api",
            "success": True,
            "error": None
        }
    except Exception as e:
        error_msg = str(e)
        logger.error(f"YouTube transcript API failed: {error_msg}")

        # "No transcript" is the expected outcome for videos without
        # captions; give it a friendlier message than generic API errors.
        if "No transcript found" in error_msg or "Could not retrieve a transcript" in error_msg:
            error = "No transcript available (video may not have captions)"
        else:
            error = f"Transcript API error: {error_msg}"

        return {
            "text": "",
            "video_id": video_id,
            "source": "api",
            "success": False,
            "error": error
        }
# ============================================================================
# Audio Fallback (Secondary Method)
# =============================================================================
def download_audio(video_url: str) -> Optional[str]:
    """
    Download audio from YouTube using yt-dlp.

    The audio is extracted to AUDIO_FORMAT at AUDIO_QUALITY kbps and written
    to the system temp directory under a name that is unique per call, so
    concurrent or repeated downloads in the same process cannot overwrite or
    pick up each other's files. The caller owns the returned file and is
    responsible for deleting it.

    Args:
        video_url: Full YouTube URL

    Returns:
        Path to downloaded audio file or None if failed
    """
    try:
        import uuid

        import yt_dlp

        logger.info(f"Downloading audio from: {video_url}")

        # Unique stem per call: the PID alone is shared by every download
        # made from this process.
        temp_dir = tempfile.gettempdir()
        stem = f"youtube_audio_{os.getpid()}_{uuid.uuid4().hex}"
        output_path = os.path.join(temp_dir, f"{stem}.{AUDIO_FORMAT}")

        # yt-dlp options: audio only, best quality
        ydl_opts = {
            'format': 'bestaudio/best',
            'postprocessors': [{
                'key': 'FFmpegExtractAudio',
                'preferredcodec': AUDIO_FORMAT,
                'preferredquality': AUDIO_QUALITY,
            }],
            # Explicit %(ext)s: the source is saved with its container
            # extension and the post-processor swaps it for AUDIO_FORMAT,
            # which yields output_path.
            'outtmpl': os.path.join(temp_dir, f"{stem}.%(ext)s"),
            'quiet': True,
            'no_warnings': True,
        }

        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            ydl.download([video_url])

        if os.path.exists(output_path):
            logger.info(f"Audio downloaded: {output_path} ({os.path.getsize(output_path)} bytes)")
            return output_path

        # Fallback: the expected file is missing (e.g. a different extension
        # was produced); accept any file that carries this call's stem.
        for file in sorted(os.listdir(temp_dir)):
            if file.startswith(stem):
                actual_path = os.path.join(temp_dir, file)
                logger.info(f"Audio downloaded: {actual_path}")
                return actual_path

        logger.error("Audio file not found after download")
        return None
    except ImportError:
        logger.error("yt-dlp not installed. Run: pip install yt-dlp")
        return None
    except Exception as e:
        logger.error(f"Audio download failed: {e}")
        return None
def transcribe_from_audio(video_url: str) -> Dict[str, Any]:
    """
    Fallback: Download audio and transcribe with Whisper.

    Never raises: invalid URLs, download failures and transcription errors
    are all reported through the returned dict. The downloaded audio file is
    removed afterwards (when CLEANUP_TEMP_FILES is set) whether transcription
    succeeded, failed or raised.

    Args:
        video_url: Full YouTube URL

    Returns:
        Dict with structure: {
            "text": str,          # Transcript text ("" on failure)
            "video_id": str,      # Video ID ("" if the URL is invalid)
            "source": str,        # Always "whisper" for this function
            "success": bool,      # True if transcription succeeded
            "error": str or None  # Error message if failed
        }
    """
    video_id = extract_video_id(video_url)

    def _failure(error: str) -> Dict[str, Any]:
        # Uniform failure payload for this fallback path.
        return {
            "text": "",
            "video_id": video_id or "",
            "source": "whisper",
            "success": False,
            "error": error
        }

    if not video_id:
        return _failure("Invalid YouTube URL")

    # Download audio
    audio_file = download_audio(video_url)
    if not audio_file:
        return _failure("Failed to download audio")

    try:
        # Import transcribe_audio lazily (avoid circular import)
        from src.tools.audio import transcribe_audio

        # Transcribe with Whisper
        result = transcribe_audio(audio_file)

        if not result["success"]:
            return _failure(result.get("error", "Transcription failed"))

        # Save to cache for debugging
        save_transcript_to_cache(video_id, result["text"], "whisper")
        return {
            "text": result["text"],
            "video_id": video_id,
            "source": "whisper",
            "success": True,
            "error": None
        }
    except Exception as e:
        logger.error(f"Whisper transcription failed: {e}")
        return _failure(f"Whisper transcription failed: {str(e)}")
    finally:
        # Cleanup runs on every exit path so a failed import or a crashing
        # transcription does not leave the audio file behind.
        if CLEANUP_TEMP_FILES:
            try:
                os.remove(audio_file)
                logger.info(f"Cleaned up temp file: {audio_file}")
            except Exception as e:
                logger.warning(f"Failed to cleanup temp file: {e}")
# ============================================================================
# Main API Function
# =============================================================================
def youtube_transcript(url: str) -> Dict[str, Any]:
    """
    Extract transcript from YouTube video.

    Strategy:
        1. youtube-transcript-api via get_youtube_transcript() (fast)
        2. On any failure of step 1, audio download + Whisper via
           transcribe_from_audio() (slower)

    An unrecognised URL does not raise; it is reported as a failure dict
    with "source" set to "none".

    Args:
        url: YouTube video URL (youtube.com, youtu.be, shorts)

    Returns:
        Dict with structure: {
            "text": str,          # Transcript text
            "video_id": str,      # Video ID ("" for an invalid URL)
            "source": str,        # "api", "whisper" or "none"
            "success": bool,      # True if transcription succeeded
            "error": str or None  # Error message if failed
        }

    Examples:
        >>> youtube_transcript("https://youtube.com/watch?v=dQw4w9WgXcQ")
        {"text": "Never gonna give you up...", "video_id": "dQw4w9WgXcQ", "source": "api", "success": True, "error": None}
    """
    # Validate URL and extract video ID
    video_id = extract_video_id(url)
    if not video_id:
        logger.error(f"Invalid YouTube URL: {url}")
        return dict(
            text="",
            video_id="",
            source="none",
            success=False,
            error=f"Invalid YouTube URL: {url}",
        )

    logger.info(f"Processing YouTube video: {video_id}")

    # Fast path: captions served by the transcript API
    api_result = get_youtube_transcript(video_id)
    if api_result["success"]:
        logger.info(f"Transcript retrieved via API: {len(api_result['text'])} characters")
        # Only a short preview of the transcript goes to the log
        logger.info(f"Transcript content: {api_result['text'][:200]}...")
        return api_result

    # Slow path: download the audio and run Whisper on it
    logger.info("Transcript API failed, trying audio transcription...")
    whisper_result = transcribe_from_audio(url)

    if not whisper_result["success"]:
        logger.error(f"All transcript methods failed for video: {video_id}")
        return whisper_result

    logger.info(f"Transcript retrieved via Whisper: {len(whisper_result['text'])} characters")
    # The complete transcript is logged for debugging
    logger.info(f"Full transcript: {whisper_result['text']}")
    return whisper_result