Spaces:

Naveen-2007
/

perplexity-clone

Running

App Files Files Community

perplexity-clone / tools /youtube_tool.py

Naveen-2007

Add web search fallback for Video Brain when YouTube transcript fails due to network issues

8e5db5f 3 months ago

raw

history blame contribute delete

6.28 kB

	# tools/youtube_tool.py
	"""
	YouTube Transcript Extraction Tool
	Extracts transcripts from YouTube videos for Video Brain mode.
	"""

	import re
	from typing import Dict, Optional

	# Optional dependency: youtube_transcript_api.
	# The import is guarded so this module can still be imported when the package
	# is missing. YOUTUBE_API_AVAILABLE records the outcome; code that uses the
	# imported names must check the flag first, because they are undefined when
	# the import fails.
	try:
	    from youtube_transcript_api import YouTubeTranscriptApi
	    from youtube_transcript_api._errors import (
	        TranscriptsDisabled,
	        NoTranscriptFound,
	        VideoUnavailable
	    )
	    YOUTUBE_API_AVAILABLE = True
	except ImportError:
	    YOUTUBE_API_AVAILABLE = False
	    print("⚠️ youtube-transcript-api not available")


	class YouTubeTool:
	    """Extract transcripts and basic metadata from YouTube videos.

	    All public methods report problems through their return value (a dict
	    with ``success``/``error`` keys, or ``None``) rather than by raising.
	    """

	    def extract_video_id(self, url: str) -> Optional[str]:
	        """Extract the 11-character video ID from a YouTube URL or bare ID.

	        Args:
	            url: A YouTube URL (watch, /v/, youtu.be, embed, shorts, live)
	                or a bare video ID.

	        Returns:
	            The video ID, or None when no 11-character ID-like run is found.
	        """
	        patterns = (
	            # Markers that directly precede the ID. The alternation bars must
	            # be unescaped: an escaped bar matches a literal '|' and the
	            # pattern would never match a real URL.
	            r'(?:v=|/v/|youtu\.be/|/embed/|/shorts/|/live/)([a-zA-Z0-9_-]{11})',
	            # Fallback for bare IDs and unrecognised URL shapes: the first
	            # run of 11 ID characters anywhere in the input.
	            r'([a-zA-Z0-9_-]{11})',
	        )

	        for pattern in patterns:
	            match = re.search(pattern, url)
	            if match:
	                return match.group(1)
	        return None

	    @staticmethod
	    def _failure(error: str, video_id: Optional[str], **extra) -> Dict:
	        """Build the failure result shared by every error path of get_transcript."""
	        result = {
	            "success": False,
	            "error": error,
	            "transcript": "",
	            "segments": [],
	            "video_id": video_id,
	        }
	        result.update(extra)
	        return result

	    @staticmethod
	    def _list_transcripts(video_id: str):
	        """Return the library's transcript listing for ``video_id``."""
	        # NOTE(review): older youtube-transcript-api releases expose a static
	        # ``list_transcripts``; 1.x releases are understood to replace it with
	        # an instance-level ``list``. Confirm against the pinned version.
	        legacy = getattr(YouTubeTranscriptApi, "list_transcripts", None)
	        if legacy is not None:
	            return legacy(video_id)
	        return YouTubeTranscriptApi().list(video_id)

	    @staticmethod
	    def _pick_transcript(transcript_list):
	        """Pick a transcript: English, else first one translated, else first one as-is."""
	        try:
	            return transcript_list.find_transcript(['en', 'en-US', 'en-GB'])
	        except Exception:
	            # No English track could be selected; fall back to the first
	            # available transcript. ``Exception`` (not a bare except) so that
	            # KeyboardInterrupt/SystemExit still propagate.
	            first = next(iter(transcript_list), None)

	        if first is None:
	            return None
	        try:
	            return first.translate('en')
	        except Exception:
	            # Translation failed: use the original language.
	            return first

	    @staticmethod
	    def _as_raw_segments(fetched):
	        """Normalise the result of ``fetch()`` to a list of segment dicts."""
	        # NOTE(review): 1.x releases are understood to return an object with
	        # ``to_raw_data()`` instead of a plain list of dicts; older releases
	        # already return the list, which is passed through unchanged.
	        to_raw = getattr(fetched, "to_raw_data", None)
	        return to_raw() if callable(to_raw) else fetched

	    def get_transcript(self, video_url: str) -> Dict:
	        """
	        Get transcript from a YouTube video.

	        Args:
	            video_url: YouTube URL or bare video ID.

	        Returns:
	            Dict with keys:
	            - success: bool
	            - transcript: str (one "[m:ss] text" line per segment)
	            - clean_transcript: str (segment texts joined by spaces;
	              present on success only)
	            - segments: list of {text, start, duration}
	            - video_id: str (None when it could not be extracted)
	            - error: str (None on success)
	            - network_error: True (present only when the failure looks like
	              a DNS resolution problem)
	        """
	        video_id = self.extract_video_id(video_url)

	        if not video_id:
	            return self._failure("Could not extract video ID from URL", None)

	        # Must come before the try block: the library names used below are
	        # undefined when the import failed.
	        if not YOUTUBE_API_AVAILABLE:
	            return self._failure("YouTube transcript API not available", video_id)

	        try:
	            transcript = self._pick_transcript(self._list_transcripts(video_id))
	            if transcript is None:
	                return self._failure("No transcript available for this video", video_id)

	            segments = self._as_raw_segments(transcript.fetch())

	            # Timestamped version: minutes are not wrapped into hours.
	            stamped_lines = []
	            for seg in segments:
	                minutes, seconds = divmod(int(seg['start']), 60)
	                stamped_lines.append(f"[{minutes}:{seconds:02d}] {seg['text']}")

	            return {
	                "success": True,
	                "transcript": "\n".join(stamped_lines),
	                # Clean version without timestamps.
	                "clean_transcript": " ".join(seg['text'] for seg in segments),
	                "segments": segments,
	                "video_id": video_id,
	                "error": None
	            }

	        except TranscriptsDisabled:
	            return self._failure("Transcripts are disabled for this video", video_id)
	        except NoTranscriptFound:
	            return self._failure("No transcript found for this video", video_id)
	        except VideoUnavailable:
	            return self._failure("Video is unavailable", video_id)
	        except Exception as e:
	            # Boundary handler: callers get an error dict, never an exception.
	            error_msg = str(e)
	            # DNS failures are flagged separately via "network_error".
	            if "NameResolutionError" in error_msg or "Failed to resolve" in error_msg:
	                return self._failure(
	                    "Network error: Cannot connect to YouTube (DNS resolution failed)",
	                    video_id,
	                    network_error=True,
	                )
	            return self._failure(f"Error fetching transcript: {error_msg[:200]}", video_id)

	    def get_video_info(self, video_url: str) -> Dict:
	        """Return the video ID, the original URL and an embed URL.

	        No network request is made; everything is derived from ``video_url``.
	        ``video_id`` and ``embed_url`` are None when no ID can be extracted.
	        """
	        video_id = self.extract_video_id(video_url)
	        return {
	            "video_id": video_id,
	            "url": video_url,
	            "embed_url": f"https://www.youtube.com/embed/{video_id}" if video_id else None
	        }