Spaces:
Running
Running
| # tools/youtube_tool.py | |
| """ | |
| YouTube Transcript Extraction Tool | |
| Extracts transcripts from YouTube videos for Video Brain mode. | |
| """ | |
| import re | |
| from typing import Dict, Optional | |
# youtube_transcript_api is an optional dependency: when it cannot be
# imported the module still loads, YOUTUBE_API_AVAILABLE is set to False and a
# notice is printed so the rest of the file can check the flag before use.
try:
    from youtube_transcript_api import YouTubeTranscriptApi
    from youtube_transcript_api._errors import (
        TranscriptsDisabled,
        NoTranscriptFound,
        VideoUnavailable
    )
except ImportError:
    YOUTUBE_API_AVAILABLE = False
    print("⚠️ youtube-transcript-api not available")
else:
    # Reached only when both imports above succeeded.
    YOUTUBE_API_AVAILABLE = True
class YouTubeTool:
    """Extract transcripts and metadata from YouTube videos."""

    # An 11-character video ID that follows one of the known URL markers
    # (watch?v=, /v/, youtu.be/, /embed/, /shorts/, /live/).
    _URL_ID_PATTERN = re.compile(
        r'(?:v=|/v/|youtu\.be/|/embed/|/shorts/|/live/)([a-zA-Z0-9_-]{11})'
    )
    # A bare video ID; only accepted when it is the entire input.
    _BARE_ID_PATTERN = re.compile(r'[a-zA-Z0-9_-]{11}')

    def extract_video_id(self, url: str) -> Optional[str]:
        """Extract the video ID from a YouTube URL or a bare video ID.

        Args:
            url: A YouTube URL containing one of the markers ``v=``, ``/v/``,
                ``youtu.be/``, ``/embed/``, ``/shorts/`` or ``/live/``, or
                an 11-character video ID on its own. Surrounding whitespace
                is ignored.

        Returns:
            The 11-character video ID, or ``None`` when the input is not a
            string, is empty, or contains no recognizable ID.
        """
        if not isinstance(url, str):
            return None
        candidate = url.strip()
        if not candidate:
            return None

        match = self._URL_ID_PATTERN.search(candidate)
        if match:
            return match.group(1)

        # Accept a bare ID only when it is the whole input. Searching for any
        # 11-character run would pull a bogus "ID" out of unrelated URLs
        # (playlist IDs, long path segments, ...).
        if self._BARE_ID_PATTERN.fullmatch(candidate):
            return candidate
        return None

    @staticmethod
    def _failure(video_id: Optional[str], error: str, **extra) -> Dict:
        """Build the failure result dict; ``extra`` adds optional flags."""
        result = {
            "success": False,
            "error": error,
            "transcript": "",
            "clean_transcript": "",
            "segments": [],
            "video_id": video_id,
        }
        result.update(extra)
        return result

    @staticmethod
    def _list_transcripts(video_id: str):
        """Return the library's transcript listing for ``video_id``."""
        if hasattr(YouTubeTranscriptApi, "list_transcripts"):
            return YouTubeTranscriptApi.list_transcripts(video_id)
        # NOTE(review): releases of youtube-transcript-api without the static
        # ``list_transcripts`` are expected to expose an instance ``list``
        # method instead -- confirm against the installed version.
        return YouTubeTranscriptApi().list(video_id)

    def _select_transcript(self, video_id: str):
        """Pick an English, else translated, else any transcript (or None)."""
        transcript_list = self._list_transcripts(video_id)
        try:
            return transcript_list.find_transcript(['en', 'en-US', 'en-GB'])
        except NoTranscriptFound:
            pass  # No English transcript; fall through to the alternatives.

        available = list(transcript_list)
        for candidate in available:
            try:
                return candidate.translate('en')
            except Exception:
                # Best effort: this transcript cannot be translated to
                # English, so try the next one.
                continue
        # Nothing could be translated: use the first transcript as-is.
        return available[0] if available else None

    @staticmethod
    def _normalize_segments(fetched) -> list:
        """Convert fetched segments to a list of {text, start, duration} dicts."""
        segments = []
        for seg in fetched:
            if isinstance(seg, dict):
                segments.append(seg)
            else:
                # Non-dict segments are read through text/start/duration
                # attributes (assumed shape of snippet objects returned by
                # newer library versions -- TODO confirm).
                segments.append({
                    "text": seg.text,
                    "start": seg.start,
                    "duration": getattr(seg, "duration", 0.0),
                })
        return segments

    def get_transcript(self, video_url: str) -> Dict:
        """
        Get transcript from a YouTube video.

        An English transcript is preferred; otherwise a transcript translated
        to English is used, and failing that the first available transcript
        in its original language.

        Args:
            video_url: YouTube URL or bare video ID.

        Returns:
            Dict with keys:
                - success: bool
                - transcript: str (one "[M:SS] text" line per segment)
                - clean_transcript: str (segment texts joined by spaces)
                - segments: list of {text, start, duration}
                - video_id: str, or None if it could not be extracted
                - error: str describing the failure (None on success)
                - network_error: True, present only for DNS failures

        Exceptions raised while listing or fetching are not propagated; they
        are reported through ``success`` and ``error``.
        """
        video_id = self.extract_video_id(video_url)
        if not video_id:
            return self._failure(None, "Could not extract video ID from URL")
        if not YOUTUBE_API_AVAILABLE:
            return self._failure(video_id, "YouTube transcript API not available")

        try:
            transcript = self._select_transcript(video_id)
            if transcript is None:
                return self._failure(
                    video_id, "No transcript available for this video"
                )

            segments = self._normalize_segments(transcript.fetch())

            # Timestamped version: "[M:SS] text" per segment (minutes are not
            # folded into hours).
            timestamped_lines = []
            for seg in segments:
                minutes, seconds = divmod(int(seg['start']), 60)
                timestamped_lines.append(
                    f"[{minutes}:{seconds:02d}] {seg['text']}"
                )

            return {
                "success": True,
                "transcript": "\n".join(timestamped_lines),
                "clean_transcript": " ".join(seg['text'] for seg in segments),
                "segments": segments,
                "video_id": video_id,
                "error": None
            }
        except TranscriptsDisabled:
            return self._failure(
                video_id, "Transcripts are disabled for this video"
            )
        except NoTranscriptFound:
            return self._failure(video_id, "No transcript found for this video")
        except VideoUnavailable:
            return self._failure(video_id, "Video is unavailable")
        except Exception as e:
            error_msg = str(e)
            # DNS failures are recognized by their message text and flagged so
            # callers can tell them apart from per-video problems.
            if "NameResolutionError" in error_msg or "Failed to resolve" in error_msg:
                return self._failure(
                    video_id,
                    "Network error: Cannot connect to YouTube (DNS resolution failed)",
                    network_error=True,
                )
            return self._failure(
                video_id, f"Error fetching transcript: {error_msg[:200]}"
            )

    def get_video_info(self, video_url: str) -> Dict:
        """Return the video ID, original URL and embed URL for a video.

        No network request is made; the values are derived from ``video_url``.

        Returns:
            Dict with keys ``video_id`` (None if it cannot be extracted),
            ``url`` (the input as given) and ``embed_url`` (None when there is
            no video ID).
        """
        video_id = self.extract_video_id(video_url)
        return {
            "video_id": video_id,
            "url": video_url,
            "embed_url": f"https://www.youtube.com/embed/{video_id}" if video_id else None
        }