agentbee

Sleeping

App Files Files Community

agentbee / src /tools /youtube.py

mangubee

refactor: rename runtime folders with underscore prefix

04ece4c 4 months ago

raw

history blame

13 kB

	"""
	YouTube Transcript Tool - Extract transcripts from YouTube videos
	Author: @mangobee
	Date: 2026-01-13

	Provides YouTube video transcript extraction:
	- Primary: youtube-transcript-api (instant, 1-3 seconds)
	- Fallback: yt-dlp audio extraction + Whisper transcription (30s-2min)
	- Handles various YouTube URL formats (watch, youtu.be, shorts)
	- Returns clean transcript text for LLM analysis

	Workflow:
	YouTube URL
	├─ Has transcript? ✅ → Use youtube-transcript-api (instant)
	└─ No transcript? ❌ → Download audio + Whisper (slower, but works)

	Requirements:
	- youtube-transcript-api: pip install youtube-transcript-api
	- yt-dlp: pip install yt-dlp
	- openai-whisper: pip install openai-whisper (via src.tools.audio)
	"""

	import logging
	import os
	import re
	import tempfile
	from typing import Dict, Any, Optional
	from pathlib import Path

	# ============================================================================
	# CONFIG
	# ============================================================================
	# YouTube URL patterns. Each pattern captures the 11-character video ID in
	# group 1. The alternation must be a bare `|`: an escaped `\|` matches a
	# literal pipe character, which would make every real URL fail to match.
	# Supported shapes: youtube.com/watch?v=ID, youtu.be/ID, youtube.com/shorts/ID
	YOUTUBE_PATTERNS = [
	    r'(?:youtube\.com/watch\?v=|youtu\.be/|youtube\.com/shorts/)([a-zA-Z0-9_-]{11})',
	]

	# Audio download settings
	AUDIO_FORMAT = "mp3"
	AUDIO_QUALITY = "128"  # 128 kbps (sufficient for speech)

	# Temporary file cleanup: when True, downloaded audio is deleted after use
	CLEANUP_TEMP_FILES = True

	# ============================================================================
	# Logging Setup
	# ============================================================================
	logger = logging.getLogger(__name__)


	# ============================================================================
	# Transcript Cache
	# ============================================================================

	def save_transcript_to_cache(video_id: str, text: str, source: str) -> None:
	    """
	    Save a transcript to the _log/ folder for debugging.

	    The file is written to ``_log/<video_id>_transcript.txt`` (relative to the
	    current working directory) with a small ``#``-prefixed header followed by
	    the transcript text. This is best-effort: any failure is logged as a
	    warning and never propagated to the caller.

	    Args:
	        video_id: YouTube video ID (used to build the file name)
	        text: Transcript text
	        source: "api" or "whisper"
	    """
	    # Local import: only this helper needs datetime, and it replaces the
	    # inline __import__('datetime') expression.
	    from datetime import datetime

	    try:
	        log_dir = Path("_log")
	        log_dir.mkdir(exist_ok=True)

	        cache_file = log_dir / f"{video_id}_transcript.txt"
	        header_lines = [
	            "# YouTube Transcript",
	            f"# Video ID: {video_id}",
	            f"# Source: {source}",
	            f"# Length: {len(text)} characters",
	            f"# Generated: {datetime.now().isoformat()}",
	        ]
	        with open(cache_file, "w", encoding="utf-8") as f:
	            # Header, one blank line, then the transcript body.
	            f.write("\n".join(header_lines) + "\n")
	            f.write(f"\n{text}\n")

	        logger.info(f"Transcript saved: {cache_file}")
	    except Exception as e:
	        # Caching is a debugging aid only; never let it break transcription.
	        logger.warning(f"Failed to save transcript: {e}")


	# ============================================================================
	# YouTube URL Parser
	# =============================================================================

	def extract_video_id(url: str) -> Optional[str]:
	    """
	    Pull the video ID out of a YouTube URL.

	    Every regex in YOUTUBE_PATTERNS is tried in order with ``re.search``; the
	    first one that matches wins and its first capture group is returned.

	    Args:
	        url: YouTube URL (a falsy value such as "" or None yields None)

	    Returns:
	        The captured video ID, or None when no pattern matches

	    Examples:
	        >>> extract_video_id("https://youtube.com/watch?v=dQw4w9WgXcQ")
	        'dQw4w9WgXcQ'

	        >>> extract_video_id("https://youtu.be/dQw4w9WgXcQ")
	        'dQw4w9WgXcQ'
	    """
	    if url:
	        # Lazily evaluate the patterns so searching stops at the first hit.
	        attempts = (re.search(regex, url) for regex in YOUTUBE_PATTERNS)
	        first_hit = next((hit for hit in attempts if hit), None)
	        if first_hit is not None:
	            return first_hit.group(1)

	    return None


	# ============================================================================
	# Transcript Extraction (Primary Method)
	# =============================================================================

	def get_youtube_transcript(video_id: str) -> Dict[str, Any]:
	    """
	    Get transcript using youtube-transcript-api.

	    Fetches the English transcript (en, en-US, en-GB), joins the non-empty
	    segment texts with single spaces, and saves the result to the debug cache.
	    Never raises: every failure is reported through the returned dict.

	    Args:
	        video_id: YouTube video ID (11 characters)

	    Returns:
	        Dict with structure: {
	            "text": str,          # Transcript text ("" on failure)
	            "video_id": str,      # Video ID
	            "source": str,        # Always "api" for this function
	            "success": bool,      # True if the transcript was fetched
	            "error": str or None  # Error message if failed
	        }
	    """
	    try:
	        from youtube_transcript_api import YouTubeTranscriptApi

	        logger.info(f"Fetching transcript for video: {video_id}")

	        # Get transcript (prefer English variants).
	        # Note: fetch() is an instance method in newer versions.
	        api = YouTubeTranscriptApi()
	        transcript_list = api.fetch(
	            video_id,
	            languages=['en', 'en-US', 'en-GB']
	        )

	        # Clean transcript: drop timestamps, combine segments.
	        # Newer library versions yield snippet objects exposing `.text`,
	        # while older ones yield plain dicts. Calling `.get()` on a snippet
	        # object raises AttributeError, which would silently force the slow
	        # Whisper fallback, so both shapes are accepted here.
	        text_parts = []
	        for entry in transcript_list:
	            if isinstance(entry, dict):
	                raw_text = entry.get('text', '')
	            else:
	                raw_text = getattr(entry, 'text', '')
	            segment = (raw_text or '').strip()
	            if segment:
	                text_parts.append(segment)

	        text = ' '.join(text_parts)

	        logger.info(f"Transcript fetched: {len(text)} characters")

	        # Save to cache for debugging
	        save_transcript_to_cache(video_id, text, "api")

	        return {
	            "text": text,
	            "video_id": video_id,
	            "source": "api",
	            "success": True,
	            "error": None
	        }

	    except ImportError:
	        # Mirrors download_audio(): give an actionable message when the
	        # optional dependency is missing.
	        logger.error(
	            "youtube-transcript-api not installed. "
	            "Run: pip install youtube-transcript-api"
	        )
	        return {
	            "text": "",
	            "video_id": video_id,
	            "source": "api",
	            "success": False,
	            "error": "Transcript API error: youtube-transcript-api not installed"
	        }

	    except Exception as e:
	        error_msg = str(e)
	        logger.error(f"YouTube transcript API failed: {error_msg}")

	        # "No transcript found" is expected for videos without captions, so
	        # it gets a friendlier message than other API failures.
	        no_captions = (
	            "No transcript found" in error_msg
	            or "Could not retrieve a transcript" in error_msg
	        )
	        if no_captions:
	            error_text = "No transcript available (video may not have captions)"
	        else:
	            error_text = f"Transcript API error: {error_msg}"

	        return {
	            "text": "",
	            "video_id": video_id,
	            "source": "api",
	            "success": False,
	            "error": error_text
	        }


	# ============================================================================
	# Audio Fallback (Secondary Method)
	# =============================================================================

	def download_audio(video_url: str) -> Optional[str]:
	    """
	    Download audio from YouTube using yt-dlp.

	    The audio is written to the system temp directory under a name that is
	    unique per call (PID plus a random UUID), so repeated or concurrent
	    downloads cannot overwrite each other, and a stale file from an earlier
	    run can never be mistaken for this download.

	    Args:
	        video_url: Full YouTube URL

	    Returns:
	        Path to downloaded audio file or None if failed. The caller owns the
	        file and is responsible for deleting it.
	    """
	    try:
	        import uuid

	        import yt_dlp

	        logger.info(f"Downloading audio from: {video_url}")

	        # Unique base name for this call. A PID-only name collides across
	        # calls, and its prefix also matches other PIDs (123 vs 1234).
	        temp_dir = tempfile.gettempdir()
	        base_name = f"youtube_audio_{os.getpid()}_{uuid.uuid4().hex}"
	        base_path = os.path.join(temp_dir, base_name)
	        expected_path = f"{base_path}.{AUDIO_FORMAT}"

	        # yt-dlp options: audio only, best quality, re-encoded by ffmpeg.
	        # The template has no extension; the extract-audio post-processor
	        # appends the target codec extension to it.
	        ydl_opts = {
	            'format': 'bestaudio/best',
	            'postprocessors': [{
	                'key': 'FFmpegExtractAudio',
	                'preferredcodec': AUDIO_FORMAT,
	                'preferredquality': AUDIO_QUALITY,
	            }],
	            'outtmpl': base_path,
	            'quiet': True,
	            'no_warnings': True,
	        }

	        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
	            ydl.download([video_url])

	        if os.path.exists(expected_path):
	            logger.info(
	                f"Audio downloaded: {expected_path} "
	                f"({os.path.getsize(expected_path)} bytes)"
	            )
	            return expected_path

	        # Fallback: the file may carry a different extension. Only files
	        # with this call's unique base name are considered.
	        for file_name in sorted(os.listdir(temp_dir)):
	            if file_name.startswith(base_name):
	                actual_path = os.path.join(temp_dir, file_name)
	                logger.info(f"Audio downloaded: {actual_path}")
	                return actual_path

	        logger.error("Audio file not found after download")
	        return None

	    except ImportError:
	        logger.error("yt-dlp not installed. Run: pip install yt-dlp")
	        return None
	    except Exception as e:
	        logger.error(f"Audio download failed: {e}")
	        return None


	def transcribe_from_audio(video_url: str) -> Dict[str, Any]:
	    """
	    Fallback: Download audio and transcribe with Whisper.

	    Never raises: every failure is reported through the returned dict. The
	    downloaded temp audio file is removed (when CLEANUP_TEMP_FILES is set)
	    whether transcription succeeds, fails, or raises.

	    Args:
	        video_url: Full YouTube URL

	    Returns:
	        Dict with structure: {
	            "text": str,          # Transcript text ("" on failure)
	            "video_id": str,      # Video ID ("" if the URL is invalid)
	            "source": str,        # Always "whisper" for this function
	            "success": bool,      # True if transcription succeeded
	            "error": str or None  # Error message if failed
	        }
	    """
	    video_id = extract_video_id(video_url)

	    if not video_id:
	        return {
	            "text": "",
	            "video_id": "",
	            "source": "whisper",
	            "success": False,
	            "error": "Invalid YouTube URL"
	        }

	    # Download audio
	    audio_file = download_audio(video_url)

	    if not audio_file:
	        return {
	            "text": "",
	            "video_id": video_id,
	            "source": "whisper",
	            "success": False,
	            "error": "Failed to download audio"
	        }

	    try:
	        # Import transcribe_audio here (avoid circular import)
	        from src.tools.audio import transcribe_audio

	        # Transcribe with Whisper
	        result = transcribe_audio(audio_file)

	        if result["success"]:
	            # Save to cache for debugging
	            save_transcript_to_cache(video_id, result["text"], "whisper")

	            return {
	                "text": result["text"],
	                "video_id": video_id,
	                "source": "whisper",
	                "success": True,
	                "error": None
	            }

	        return {
	            "text": "",
	            "video_id": video_id,
	            "source": "whisper",
	            "success": False,
	            "error": result.get("error", "Transcription failed")
	        }

	    except Exception as e:
	        logger.error(f"Whisper transcription failed: {e}")
	        return {
	            "text": "",
	            "video_id": video_id,
	            "source": "whisper",
	            "success": False,
	            "error": f"Whisper transcription failed: {str(e)}"
	        }

	    finally:
	        # Cleanup lives in `finally` so the temp file is also removed when
	        # the import or the transcription raises; otherwise it would leak.
	        if CLEANUP_TEMP_FILES:
	            try:
	                os.remove(audio_file)
	                logger.info(f"Cleaned up temp file: {audio_file}")
	            except Exception as e:
	                logger.warning(f"Failed to cleanup temp file: {e}")


	# ============================================================================
	# Main API Function
	# =============================================================================

	def youtube_transcript(url: str) -> Dict[str, Any]:
	    """
	    Produce a transcript for a YouTube video.

	    Strategy:
	        1. Ask youtube-transcript-api for existing captions (fast).
	        2. If that fails, download the audio and transcribe it with Whisper
	           (slow).

	    Args:
	        url: YouTube video URL (youtube.com, youtu.be, shorts)

	    Returns:
	        Dict with structure: {
	            "text": str,          # Transcript text
	            "video_id": str,      # Video ID ("" when the URL is invalid)
	            "source": str,        # "api", "whisper", or "none" for a bad URL
	            "success": bool,      # True if transcription succeeded
	            "error": str or None  # Error message if failed
	        }

	    Note:
	        An unrecognised URL does not raise; it is reported as a failure dict
	        with source "none".

	    Examples:
	        >>> youtube_transcript("https://youtube.com/watch?v=dQw4w9WgXcQ")
	        {"text": "Never gonna give you up...", "video_id": "dQw4w9WgXcQ", "source": "api", "success": True, "error": None}
	    """
	    # Step 0: the URL must yield a video ID before anything else happens.
	    video_id = extract_video_id(url)

	    if not video_id:
	        logger.error(f"Invalid YouTube URL: {url}")
	        invalid_url_result = {
	            "text": "",
	            "video_id": "",
	            "source": "none",
	            "success": False,
	            "error": f"Invalid YouTube URL: {url}"
	        }
	        return invalid_url_result

	    logger.info(f"Processing YouTube video: {video_id}")

	    # Step 1: captions via the transcript API (fast path).
	    api_result = get_youtube_transcript(video_id)

	    if api_result["success"]:
	        api_text = api_result['text']
	        logger.info(f"Transcript retrieved via API: {len(api_text)} characters")
	        # Only a 200-character preview goes to the log on this path.
	        logger.info(f"Transcript content: {api_text[:200]}...")
	        return api_result

	    # Step 2: audio download + Whisper (slow path).
	    logger.info("Transcript API failed, trying audio transcription...")
	    whisper_result = transcribe_from_audio(url)

	    if not whisper_result["success"]:
	        logger.error(f"All transcript methods failed for video: {video_id}")
	        return whisper_result

	    whisper_text = whisper_result['text']
	    logger.info(f"Transcript retrieved via Whisper: {len(whisper_text)} characters")
	    # The Whisper path logs the complete transcript for debugging.
	    logger.info(f"Full transcript: {whisper_text}")
	    return whisper_result