Final_Assignment_Template

Sleeping

Final_Assignment_Template / tools /searchtools.py

Add new tools and functionalities for audio transcription, code execution, document handling, image processing, and mathematical operations

d303e2f 9 months ago

raw

history blame contribute delete

4.83 kB

	import os
	import re
	from typing import Dict, Optional, Union

	from langchain_community.document_loaders import ArxivLoader
	from langchain_community.document_loaders import WikipediaLoader
	from langchain_community.tools.tavily_search import TavilySearchResults
	from langchain_core.tools import tool
	from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, NoTranscriptFound # Added

	@tool
	def wiki_search(query: str) -> Dict[str, str]:
	    """Search Wikipedia for a query and return maximum 2 results.

	    Args:
	        query: The search query.

	    Returns:
	        A dict with the single key "wiki_results" whose value is the
	        matching pages rendered as <Document> blocks separated by "---".
	    """
	    search_docs = WikipediaLoader(query=query, load_max_docs=2).load()
	    # Metadata is read with .get() so a document lacking a "source" entry
	    # cannot abort the whole search with a KeyError.
	    formatted_search_docs = "\n\n---\n\n".join(
	        f'<Document source="{doc.metadata.get("source", "N/A")}" '
	        f'page="{doc.metadata.get("page", "")}">\n{doc.page_content}\n</Document>'
	        for doc in search_docs
	    )
	    return {"wiki_results": formatted_search_docs}


	@tool
	def web_search(query: str) -> Dict[str, str]:
	    """Search Tavily for a query and return maximum 3 results.

	    Args:
	        query: The search query.

	    Returns:
	        A dict with the single key "web_results" whose value is the hits
	        rendered as <Document> blocks separated by "---", or an error
	        message when the search backend did not return a result list.
	    """
	    search_results = TavilySearchResults(max_results=3).invoke({"query": query})

	    # The Tavily tool signals a failed request by returning the error text
	    # instead of a list of hits; iterating that string would crash on .get().
	    if not isinstance(search_results, list):
	        return {"web_results": f"Error: web search failed: {search_results}"}

	    formatted_search_docs = "\n\n---\n\n".join(
	        f'<Document source="{doc.get("url", "")}">\n'
	        f'{doc.get("content", doc.get("snippet", ""))}\n</Document>'
	        for doc in search_results
	    )
	    return {"web_results": formatted_search_docs}


	@tool
	def arxiv_search(query: str) -> Dict[str, str]:
	    """Search Arxiv for a query and return maximum 3 results.

	    Args:
	        query: The search query.

	    Returns:
	        A dict with the single key "arxiv_results" whose value is the
	        matching papers rendered as <Document> blocks separated by "---".
	        Each paper's text is truncated to its first 1000 characters.
	    """
	    search_docs = ArxivLoader(query=query, load_max_docs=3).load()
	    formatted_search_docs = "\n\n---\n\n".join(
	        f'<Document source="{doc.metadata.get("source", "N/A")}" '
	        f'page="{doc.metadata.get("page", "")}">\n{doc.page_content[:1000]}\n</Document>'
	        for doc in search_docs
	    )
	    return {"arxiv_results": formatted_search_docs}


	def _extract_youtube_video_id(youtube_url: str) -> Optional[str]:
	    """Return the video ID embedded in a YouTube URL, or None if none is found.

	    Recognises a ``v=`` query parameter of a ``watch`` URL in any position,
	    and ``youtu.be/<id>``, ``/shorts/<id>``, ``/embed/<id>``, ``/live/<id>``
	    paths. Trailing query parameters and ``#fragment`` parts are not
	    included in the returned ID.
	    """
	    url = youtube_url.strip()
	    patterns = (
	        r"watch\?(?:[^#]*?&)?v=([^&#]+)",
	        r"(?:youtu\.be/|/shorts/|/embed/|/live/)([^?&#/]+)",
	    )
	    for pattern in patterns:
	        match = re.search(pattern, url)
	        if match:
	            return match.group(1)
	    return None


	def _select_transcript(transcript_list):
	    """Pick the preferred transcript from a transcript list, or None if empty.

	    Preference order: manually created English, auto-generated English,
	    any manually created transcript, any auto-generated transcript.
	    """
	    # False sorts before True, so the tuple encodes the preference order.
	    return min(
	        transcript_list,
	        key=lambda t: (t.language_code != "en", bool(t.is_generated)),
	        default=None,
	    )


	@tool
	def get_youtube_transcript(youtube_url: str) -> Union[str, Dict[str, str]]:
	    """Fetches the transcript for a given YouTube video URL using youtube-transcript-api directly.
	    If the video has no transcript, it will return an error message. Then use web_search to find the transcript.

	    Args:
	        youtube_url: The URL of the YouTube video.

	    Returns:
	        A dict with the single key "youtube_transcript" holding the full
	        transcript text on success, otherwise a string describing the error.
	    """
	    try:
	        video_id = _extract_youtube_video_id(youtube_url)
	        if not video_id:
	            return "Error: Could not parse YouTube video ID from URL."

	        transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)

	        # Choose by iterating the available transcripts; the transcript list
	        # exposes no attribute enumerating its languages.
	        transcript = _select_transcript(transcript_list)
	        if transcript is None:
	            return "Error: No manual or auto-generated transcripts found for this video in any language."

	        fetched_transcript = transcript.fetch()
	        if not fetched_transcript:
	            return "Could not retrieve transcript for the video. The video might not have transcripts available."

	        # Snippets are objects with a .text attribute in newer releases of the
	        # library and plain dicts in older ones; accept both.
	        full_transcript = " ".join(
	            item["text"] if isinstance(item, dict) else item.text
	            for item in fetched_transcript
	        )

	        # Wrapped in a dictionary, similar to the other tools in this module.
	        return {"youtube_transcript": full_transcript}

	    except TranscriptsDisabled:
	        return "Error: Transcripts are disabled for this video."
	    except NoTranscriptFound:
	        return "Error: No transcripts found for this video (this should have been caught earlier, but good fallback)."
	    except Exception as e:
	        # Tool boundary: report any other failure (network, API changes) as
	        # text so the calling agent can react instead of crashing.
	        if "HTTP Error 403" in str(e) or "Too Many Requests" in str(e):
	            return f"Error: YouTube API request failed, possibly due to rate limiting or access restrictions: {str(e)}"
	        return f"Error fetching YouTube transcript using youtube-transcript-api: {str(e)}"