perplexity-clone / tools /youtube_tool.py
Naveen-2007's picture
Add web search fallback for Video Brain when YouTube transcript fails due to network issues
8e5db5f
# tools/youtube_tool.py
"""
YouTube Transcript Extraction Tool
Extracts transcripts from YouTube videos for Video Brain mode.
"""
import re
from typing import Dict, Optional
# Try to import youtube_transcript_api, but handle if it fails
try:
from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api._errors import (
TranscriptsDisabled,
NoTranscriptFound,
VideoUnavailable
)
YOUTUBE_API_AVAILABLE = True
except ImportError:
YOUTUBE_API_AVAILABLE = False
print("⚠️ youtube-transcript-api not available")
class YouTubeTool:
    """Extract transcripts and basic metadata from YouTube videos.

    Transcript fetching relies on the module-level ``YOUTUBE_API_AVAILABLE``
    flag and the ``youtube_transcript_api`` names imported at the top of this
    file. ``get_transcript`` reports every failure through its result dict
    rather than raising.
    """

    # Language codes tried first when looking for an English transcript.
    _ENGLISH_CODES = ['en', 'en-US', 'en-GB']

    # The patterns below treat a video ID as 11 characters from [a-zA-Z0-9_-].
    _ID_CHARS = r'[a-zA-Z0-9_-]{11}'

    # ID preceded by one of the recognised URL markers. "/shorts/" and "/live/"
    # are listed explicitly so those links keep working now that a bare ID is
    # only accepted when it is the entire input.
    _URL_ID_RE = re.compile(
        r'(?:v=|/v/|youtu\.be/|/embed/|/shorts/|/live/)(' + _ID_CHARS + r')'
    )

    # A bare ID: the whole input must be the ID (checked with fullmatch).
    _BARE_ID_RE = re.compile(_ID_CHARS)

    def extract_video_id(self, url: str) -> Optional[str]:
        """Extract the video ID from a YouTube URL or a bare video ID.

        Args:
            url: A URL containing ``v=``, ``/v/``, ``youtu.be/``, ``/embed/``,
                ``/shorts/`` or ``/live/`` followed by the ID, or just the
                11-character ID itself (surrounding whitespace is ignored).

        Returns:
            The 11-character video ID, or None when the input is not a string,
            is empty, or contains no recognisable ID.
        """
        if not isinstance(url, str):
            return None
        candidate = url.strip()
        if not candidate:
            return None

        match = self._URL_ID_RE.search(candidate)
        if match:
            return match.group(1)

        # Accept a bare ID only when it is the whole input; searching for any
        # 11-character run would turn arbitrary URLs/text into bogus IDs.
        if self._BARE_ID_RE.fullmatch(candidate):
            return candidate
        return None

    @staticmethod
    def _failure(video_id: Optional[str], error: str, network_error: bool = False) -> Dict:
        """Build the failure result dict; ``network_error`` is only added when True."""
        result = {
            "success": False,
            "error": error,
            "transcript": "",
            "segments": [],
            "video_id": video_id
        }
        if network_error:
            result["network_error"] = True
        return result

    @staticmethod
    def _list_transcripts(video_id: str):
        """Return the library's transcript listing for ``video_id``."""
        legacy = getattr(YouTubeTranscriptApi, "list_transcripts", None)
        if legacy is not None:
            return legacy(video_id)
        # NOTE(review): newer youtube-transcript-api releases appear to expose
        # this as the instance method ``list`` instead of the static
        # ``list_transcripts`` -- confirm against the installed version.
        return YouTubeTranscriptApi().list(video_id)

    def _pick_transcript(self, transcript_list):
        """Pick a transcript: English, else first one translated to English, else first one."""
        try:
            return transcript_list.find_transcript(self._ENGLISH_CODES)
        except Exception:
            # Deliberate best-effort: any failure here just means "no English
            # transcript", so fall through to the other options.
            pass

        available = list(transcript_list)
        if not available:
            return None
        first = available[0]
        try:
            return first.translate('en')
        except Exception:
            # Translation not possible; use the transcript in its own language.
            return first

    @staticmethod
    def _normalize_segments(fetched) -> list:
        """Return the fetched segments as a list of dicts with text/start/duration.

        Dict segments are passed through unchanged. Non-dict segments are
        presumed to be objects exposing ``text``, ``start`` and ``duration``
        attributes (as newer library versions appear to return -- TODO confirm).
        """
        normalized = []
        for seg in fetched:
            if isinstance(seg, dict):
                normalized.append(seg)
            else:
                normalized.append({
                    "text": seg.text,
                    "start": seg.start,
                    "duration": seg.duration,
                })
        return normalized

    def get_transcript(self, video_url: str) -> Dict:
        """
        Get transcript from a YouTube video.

        Args:
            video_url: YouTube URL (or bare video ID) to fetch the transcript for.

        Returns:
            Dict with keys:
            - success: bool
            - transcript: str (one "[m:ss] text" line per segment; "" on failure)
            - clean_transcript: str (segment texts joined by spaces; success only)
            - segments: list of {text, start, duration}
            - video_id: str (None when it could not be extracted)
            - error: str (None on success)
            - network_error: True (only present when DNS resolution failed)
        """
        video_id = self.extract_video_id(video_url)
        if not video_id:
            return self._failure(None, "Could not extract video ID from URL")
        if not YOUTUBE_API_AVAILABLE:
            return self._failure(video_id, "YouTube transcript API not available")

        try:
            # Listing covers both manually created and auto-generated transcripts.
            transcript_list = self._list_transcripts(video_id)
            transcript = self._pick_transcript(transcript_list)
            if transcript is None:
                return self._failure(video_id, "No transcript available for this video")

            segments = self._normalize_segments(transcript.fetch())

            # Build full transcript text with timestamps
            full_text_parts = []
            for seg in segments:
                minutes, seconds = divmod(int(seg['start']), 60)
                full_text_parts.append(f"[{minutes}:{seconds:02d}] {seg['text']}")
            full_transcript = "\n".join(full_text_parts)

            # Also create a clean version without timestamps
            clean_text = " ".join(seg['text'] for seg in segments)

            return {
                "success": True,
                "transcript": full_transcript,
                "clean_transcript": clean_text,
                "segments": segments,
                "video_id": video_id,
                "error": None
            }
        # The library exception names are safe to reference here: this point is
        # only reached when YOUTUBE_API_AVAILABLE is true, i.e. the import worked.
        except TranscriptsDisabled:
            return self._failure(video_id, "Transcripts are disabled for this video")
        except NoTranscriptFound:
            return self._failure(video_id, "No transcript found for this video")
        except VideoUnavailable:
            return self._failure(video_id, "Video is unavailable")
        except Exception as e:
            error_msg = str(e)
            # DNS failures are flagged separately via "network_error" so that
            # callers can tell them apart from other errors.
            if "NameResolutionError" in error_msg or "Failed to resolve" in error_msg:
                return self._failure(
                    video_id,
                    "Network error: Cannot connect to YouTube (DNS resolution failed)",
                    network_error=True,
                )
            # Truncate so a huge library message does not flood the result.
            return self._failure(video_id, f"Error fetching transcript: {error_msg[:200]}")

    def get_video_info(self, video_url: str) -> Dict:
        """Return basic video info derived from the URL alone (no network access).

        Returns:
            Dict with ``video_id`` (None if it could not be extracted), the
            original ``url``, and ``embed_url`` (None when there is no video ID).
        """
        video_id = self.extract_video_id(video_url)
        return {
            "video_id": video_id,
            "url": video_url,
            "embed_url": f"https://www.youtube.com/embed/{video_id}" if video_id else None
        }