| | import base64 |
| | import pandas as pd |
| | from langchain_core.messages import HumanMessage |
| | from langchain.tools import tool |
| | from langchain_community.tools.tavily_search import TavilySearchResults |
| | from langchain_community.document_loaders import WikipediaLoader, ArxivLoader |
| | import yt_dlp |
| | import ffmpeg |
| |
|
| |
|
@tool
def read_excel(file_path: str) -> str:
    """
    Extract readable text from an Excel file (.xlsx or .xls).

    Args:
        file_path: Path to the Excel file.

    Returns:
        A string representation of all sheets and their content,
        or an error message if the file cannot be read.
    """
    try:
        # sheet_name=None loads every sheet into a {name: DataFrame} mapping.
        workbook = pd.read_excel(file_path, sheet_name=None)
        rendered = [
            f"Sheet: {name}\n{frame.to_string(index=False)}"
            for name, frame in workbook.items()
        ]
        return "\n\n".join(rendered)
    except Exception as e:
        return f"Error reading Excel file: {str(e)}"
| |
|
| |
|
@tool
def read_python(file_path: str) -> str:
    """
    Extract source code from a Python (.py) file.

    Args:
        file_path: Path to the Python file.

    Returns:
        A string containing the full source code of the file,
        or an error message if the file cannot be read.
    """
    try:
        with open(file_path, "r", encoding="utf-8") as source:
            code = source.read()
        return code
    except Exception as e:
        return f"Error reading Python file: {str(e)}"
| |
|
| | |
class ExtractTextFromImage:
    """Callable OCR helper backed by a vision-capable chat model."""

    def __init__(self, multimodal_model):
        # Model must expose .invoke(messages) and return a message with .content.
        self.multimodal_model = multimodal_model

    def __call__(self, img_path: str) -> str:
        """
        Extract text from an image file.

        Args:
            img_path: A string representing the path to an image (e.g., PNG, JPEG).

        Returns:
            A single string containing the concatenated text extracted from
            the image, or "" if extraction fails.
        """
        try:
            with open(img_path, "rb") as image_file:
                raw_bytes = image_file.read()

            # Inline the image as a base64 data URL inside the message payload.
            encoded = base64.b64encode(raw_bytes).decode("utf-8")

            request = [
                HumanMessage(
                    content=[
                        {
                            "type": "text",
                            "text": (
                                "Extract all the text from this image. "
                                "Return only the extracted text, no explanations."
                            ),
                        },
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/png;base64,{encoded}"
                            },
                        },
                    ]
                )
            ]

            reply = self.multimodal_model.invoke(request)
            collected = reply.content + "\n\n"
            return collected.strip()
        except Exception as e:
            print(f"Error extracting text: {str(e)}")
            return ""
| |
|
| |
|
class DescribeImage:
    """Callable helper that asks a vision-capable chat model to describe an image."""

    def __init__(self, multimodal_model):
        # Model must expose .invoke(messages) and return a message with .content.
        self.multimodal_model = multimodal_model

    def __call__(self, img_path: str, query: str) -> str:
        """
        Generate a detailed description of an image.

        Reads an image from disk, base64-encodes it, and sends it to a
        vision-capable language model to obtain a natural-language description
        of the image's content, guided by a specific query.

        Args:
            img_path: A string representing the path to an image (e.g., PNG, JPEG).
            query: Information to extract from the image.

        Returns:
            A single string containing a detailed description of the image,
            or "" if description fails.
        """
        try:
            with open(img_path, "rb") as image_file:
                raw_bytes = image_file.read()

            # Inline the image as a base64 data URL inside the message payload.
            encoded = base64.b64encode(raw_bytes).decode("utf-8")

            request = [
                HumanMessage(
                    content=[
                        {
                            "type": "text",
                            "text": (
                                f"Describe this image in rich detail. Include objects, people, setting, background elements, and any inferred actions or context. Avoid technical jargon. In particular, extract the following information: {query}" ),
                        },
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/png;base64,{encoded}"
                            },
                        },
                    ]
                )
            ]

            reply = self.multimodal_model.invoke(request)
            return reply.content.strip()
        except Exception as e:
            print(f"Error describing image: {str(e)}")
            return ""
| |
|
| | |
class TranscribeAudio:
    """Callable helper that transcribes MP3 audio via a multimodal chat model."""

    def __init__(self, multimodal_model):
        # Audio-capable chat model exposing .invoke(messages).
        self.multimodal_model = multimodal_model

    def __call__(self, audio_path: str, query: str) -> str:
        """
        Transcribe an MP3 file.

        Args:
            audio_path: Path to the MP3 audio file.
            query: Accepted for interface parity with the other media tools;
                currently not used in the transcription prompt.

        Returns:
            Transcribed text as a string, or "" if transcription fails.
        """
        try:
            with open(audio_path, "rb") as audio_file:
                audio_bytes = audio_file.read()

            # BUG FIX: the original built an `AudioFile(...)` object, but no
            # such name is imported or defined anywhere, so every call raised
            # NameError and silently returned "". Inline the audio as base64
            # instead, using a LangChain "media" content block (assumes a model
            # that accepts inline base64 audio, e.g. Gemini — TODO confirm).
            audio_base64 = base64.b64encode(audio_bytes).decode("utf-8")

            message = [
                HumanMessage(
                    content=[
                        {
                            "type": "text",
                            "text": (
                                "Transcribe the speech from this audio file. "
                                "Return only the transcribed text, with no extra commentary."
                            ),
                        },
                        {
                            "type": "media",
                            "mime_type": "audio/mpeg",
                            "data": audio_base64,
                        },
                    ]
                )
            ]

            # BUG FIX: the original invoked `self.audio_llm`, an attribute that
            # is never set; the model is stored as `self.multimodal_model`.
            response = self.multimodal_model.invoke(message)
            return response.content.strip()

        except Exception as e:
            error_msg = f"Error transcribing audio: {str(e)}"
            print(error_msg)
            return ""
| |
|
| |
|
@tool
def download_youtube_video(youtube_url: str, output_path: str) -> str:
    """
    Download a YouTube video as an MP4 file.

    Args:
        youtube_url: The YouTube video URL.
        output_path: Desired output path for the downloaded MP4 file.

    Returns:
        Path to the saved video file.
    """
    # Prefer separate best video+audio streams merged into MP4; fall back to
    # the best single MP4 (or any best) format if merging isn't possible.
    download_options = {
        'format': 'bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best',
        'outtmpl': output_path,
        'merge_output_format': 'mp4',
        'quiet': True,
    }
    with yt_dlp.YoutubeDL(download_options) as downloader:
        downloader.download([youtube_url])
    return output_path
| |
|
| |
|
@tool
def extract_audio_from_video(video_path: str, audio_output: str, max_seconds: int = 60) -> str:
    """
    Extracts audio from an MP4 video file and saves it as MP3.

    Only the first `max_seconds` seconds of audio are extracted. The default
    of 60 preserves the previous hard-coded cap (which was undocumented) and
    bounds output size and processing time.

    Args:
        video_path: Path to the input MP4 video file.
        audio_output: Path for the output MP3 file.
        max_seconds: Maximum duration of audio to extract, in seconds.

    Returns:
        Path to the audio file.

    Raises:
        RuntimeError: If ffmpeg fails to process the input.
    """
    try:
        (
            ffmpeg
            .input(video_path)
            .output(audio_output, format='mp3', acodec='libmp3lame', t=max_seconds)
            .overwrite_output()
            .run(quiet=True)
        )
        return audio_output
    except ffmpeg.Error as e:
        # Surface ffmpeg's stderr so the caller sees the real failure reason.
        raise RuntimeError(f"FFmpeg error: {e.stderr.decode()}") from e
| | |
| | |
@tool
def wiki_search(query: str) -> str:
    """Search Wikipedia for a query and return maximum 2 results.

    Args:
        query: The search query."""
    search_docs = WikipediaLoader(query=query, load_max_docs=2).load()
    formatted_search_docs = "\n\n---\n\n".join(
        f'<Document source="{doc.metadata["source"]}" page="{doc.metadata.get("page", "")}"/>\n{doc.page_content}\n</Document>'
        for doc in search_docs
    )
    # BUG FIX: the function is annotated -> str but previously returned
    # {"wiki_results": ...}; return the formatted string so the tool output
    # matches its declared contract.
    return formatted_search_docs
| |
|
| |
|
@tool
def web_search(query: str) -> str:
    """Search Tavily for a query and return maximum 3 results.

    Args:
        query: The search query."""
    search_docs = TavilySearchResults(max_results=3).invoke(query)
    # BUG FIX: TavilySearchResults.invoke returns a list of plain dicts
    # (keys like "url" and "content"), not Document objects — the previous
    # code accessed .metadata / .page_content, which raised AttributeError.
    formatted_search_docs = "\n\n---\n\n".join(
        f'<Document source="{doc.get("url", "")}"/>\n{doc.get("content", "")}\n</Document>'
        for doc in search_docs
    )
    # Return the string directly to match the declared -> str contract
    # (previously wrapped in a dict, contradicting the annotation).
    return formatted_search_docs
| |
|
| |
|
@tool
def arxiv_search(query: str) -> str:
    """Search Arxiv for a query and return maximum 3 results.

    Args:
        query: The search query."""
    search_docs = ArxivLoader(query=query, load_max_docs=3).load()
    # BUG FIX: ArxivLoader document metadata uses keys such as "Title" and
    # "Published" and has no "source" key, so the previous direct indexing
    # raised KeyError. Fall back gracefully to the paper title.
    formatted_search_docs = "\n\n---\n\n".join(
        f'<Document source="{doc.metadata.get("source", doc.metadata.get("Title", ""))}" page="{doc.metadata.get("page", "")}"/>\n{doc.page_content[:1000]}\n</Document>'
        for doc in search_docs
    )
    # Return the string directly to match the declared -> str contract
    # (previously wrapped in a dict under the misspelled key "arvix_results").
    return formatted_search_docs
| |
|