Final_Assignment_Template

Sleeping

App Files Files Community

Final_Assignment_Template / youtube_tool.py

cowrycode

Update youtube_tool.py

4793736 verified 6 months ago

raw

history blame

3.94 kB

	import re
	from urllib.parse import parse_qs, urlparse

	from llama_index.core.tools import FunctionTool
	from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, NoTranscriptFound, VideoUnavailable

	# Unused alternative loader, kept for reference only:
	#loader = YoutubeTranscriptReader()
	# Shared YouTubeTranscriptApi instance; the transcript helpers below call
	# `yt_ap.fetch(...)` on it.
	yt_ap = YouTubeTranscriptApi()


	def extract_video_id(url: str) -> str:
	    """
	    Extract the 11-character video ID from a YouTube URL.

	    Recognised URL shapes:
	    - youtube.com/watch?v=VIDEO_ID (``v`` may appear anywhere in the query)
	    - youtube.com/embed/VIDEO_ID, /shorts/VIDEO_ID and /live/VIDEO_ID
	    - youtu.be/VIDEO_ID

	    Args:
	        url (str): The full YouTube video URL.
	    Returns:
	        str: The extracted video ID.
	    Raises:
	        ValueError: If none of the supported patterns matches the URL.
	    """
	    # NOTE(review): this module defines `extract_video_id` a second time
	    # further down, which rebinds the name at import time - confirm which
	    # implementation is meant to be the live one.
	    patterns = (
	        # `(?:[^#]*&)?` lets `v=` follow other query parameters.
	        r"youtube\.com/watch\?(?:[^#]*&)?v=([a-zA-Z0-9_-]{11})",
	        r"youtube\.com/(?:embed|shorts|live)/([a-zA-Z0-9_-]{11})",
	        r"youtu\.be/([a-zA-Z0-9_-]{11})",
	    )
	    for pattern in patterns:
	        match = re.search(pattern, url)
	        if match:
	            return match.group(1)
	    raise ValueError("Invalid YouTube URL or unable to extract video ID.")

	def get_youtube_transcript(url: str) -> str:
	    """
	    Fetch the transcript text for a given YouTube video.

	    Args:
	        url (str): The YouTube video URL.
	    Returns:
	        str: The transcript snippets joined by single spaces, stripped and
	        truncated to 2000 characters, or a human-readable error message
	        when the transcript cannot be retrieved. This function does not
	        raise; every failure is reported through the returned string.
	    """
	    try:
	        video_id = extract_video_id(url)
	        # Go through the shared client instance, as the other transcript
	        # helpers in this module do. The static `get_transcript` helper
	        # belongs to the older, pre-1.0 API of youtube-transcript-api.
	        transcript = yt_ap.fetch(video_id)
	        full_text = " ".join(snippet.text for snippet in transcript)
	        # Truncate to 2000 chars to prevent token overflow.
	        return full_text.strip()[:2000]
	    except TranscriptsDisabled:
	        return "This video has transcripts disabled."
	    except NoTranscriptFound:
	        return "No transcript was found for this video."
	    except VideoUnavailable:
	        return "This video is unavailable."
	    except Exception as e:
	        # Catch-all boundary: the result is handed back to the caller as
	        # text, so report the problem instead of propagating it.
	        return f"Transcript error: {str(e)}"

	# Wrap `get_youtube_transcript` in a LlamaIndex FunctionTool.
	youtube_tool = FunctionTool.from_defaults(get_youtube_transcript)


	def extract_video_id(url: str) -> str:
	    """
	    Extract the video ID from a YouTube URL or a bare video ID.

	    Handles typical YouTube URLs:
	    - https://www.youtube.com/watch?v=VIDEO_ID (also m./music. subdomains)
	    - https://www.youtube.com/embed/VIDEO_ID, /shorts/, /live/, /v/
	    - https://youtu.be/VIDEO_ID
	    - any of the above with extra query params
	    - a raw VIDEO_ID string

	    Args:
	        url (str): The YouTube URL or raw video ID.
	    Returns:
	        str: The extracted video ID. Input that matches none of the known
	        shapes yields its URL path with surrounding slashes removed; no
	        validation of the result is performed.
	    """
	    parsed = urlparse(url.strip())
	    # `hostname` is None for raw IDs and other scheme-less input.
	    host = parsed.hostname or ""
	    if host == "youtube.com" or host.endswith(".youtube.com"):
	        ids = parse_qs(parsed.query).get("v")
	        if ids:
	            return ids[0]
	        # Path-style links carry the ID as the second path segment.
	        segments = [part for part in parsed.path.split("/") if part]
	        if len(segments) >= 2 and segments[0] in {"embed", "shorts", "live", "v"}:
	            return segments[1]
	    # fallback for youtu.be or raw IDs; also drops a trailing slash
	    return parsed.path.strip("/")

	def fetch_youtube_transcript(video_url: str) -> str:
	    """
	    Fetch YouTube transcript text for the given URL, in English.

	    Args:
	        video_url (str): URL of the YouTube video.
	    Returns:
	        str: The text of every transcript snippet joined by single spaces,
	        or a string starting with "Error fetching video details:" when
	        fetching or assembling the transcript raises.
	    """
	    vid = extract_video_id(video_url)

	    try:
	        # `yt_ap` is the module-level YouTubeTranscriptApi instance. Only
	        # English is requested; more language codes can be appended to the
	        # list (per the original author's note, `yt_ap.list(video_id)`
	        # reports which languages are available).
	        fetched = yt_ap.fetch(video_id=vid, languages=["en"])
	        # Each item exposes the snippet text via `.text`; timing
	        # information is ignored here.
	        return " ".join(piece.text for piece in fetched)
	    except Exception as err:
	        return f"Error fetching video details: {str(err)}"

	def fetch_youtube_transcript_snippets(video_url: str) -> str:
	    """
	    Fetch YouTube transcript snippets, with timing, for the given URL.

	    Each snippet is rendered with its text, its duration and its start
	    time. An end time is not emitted.

	    Args:
	        video_url (str): URL of the YouTube video.
	    Returns:
	        str: One "Text: ... Duration: ... StartTime: ... <End>" entry per
	        snippet, joined by single spaces, or a string starting with
	        "Error fetching video details:" when fetching or formatting raises.
	    """
	    vid = extract_video_id(video_url)

	    try:
	        # `yt_ap` is the module-level YouTubeTranscriptApi instance. Only
	        # English is requested; more language codes can be appended to the
	        # list (per the original author's note, `yt_ap.list(video_id)`
	        # reports which languages are available).
	        fetched = yt_ap.fetch(video_id=vid, languages=["en"])
	        return " ".join(
	            f"Text: {piece.text} Duration: {piece.duration} StartTime: {piece.start} <End>"
	            for piece in fetched
	        )
	    except Exception as err:
	        return f"Error fetching video details: {str(err)}"

	# Wrap the two fetch helpers in LlamaIndex FunctionTool objects: one for
	# the plain transcript text, one for the per-snippet timing output.
	youtube_transcript_tool = FunctionTool.from_defaults(fetch_youtube_transcript)
	youtube_transcript_snippet_tool = FunctionTool.from_defaults(fetch_youtube_transcript_snippets)