| import os |
| from typing import Optional, Dict, Any |
| from llama_index.readers.whisper import WhisperReader |
| from llama_index.core.tools import FunctionTool |
| from llama_index.core import SimpleDirectoryReader |
| from llama_index.readers.file import ( |
| ImageReader |
| ) |
| import base64 |
| import sys |
| import traceback |
| from PIL import Image |
| from llama_index.llms.openai import OpenAI |
| from llama_index.llms.anthropic import Anthropic |
|
|
class WhisperTranscriber:
    """Class for transcribing audio using OpenAI's Whisper model."""

    def __init__(self, model: str = "whisper-1", api_key: Optional[str] = None):
        """Initialize the WhisperTranscriber with specified model and API key."""
        # Fall back to the OPENAI_API_KEY environment variable when no key is given.
        self.api_key = api_key or os.getenv("OPENAI_API_KEY")
        self.model = model
        self.reader = WhisperReader(model=self.model, api_key=self.api_key)

    def transcribe(self, audio_file_path: str) -> str:
        """
        Transcribe an audio file to text.

        Args:
            audio_file_path: Path to the audio file (.mp3, .wav, etc.)

        Returns:
            Transcribed text from the audio file, or a message string on
            failure / empty result (this method never raises).
        """
        try:
            docs = self.reader.load_data(audio_file_path)
            # Guard clause: the reader may legitimately return nothing.
            if not docs:
                return "No transcription was generated from the audio file."
            # Stitch every document's text into a single transcript string.
            return " ".join(doc.text for doc in docs if hasattr(doc, 'text'))
        except Exception as e:
            # Report errors as a string so agent/tool callers never crash.
            return f"Error transcribing audio file: {str(e)}"
|
|
|
|
| |
# Single shared transcriber instance so the tool below reuses one WhisperReader.
whisper_transcriber = WhisperTranscriber()


# Expose the bound transcribe method as a LlamaIndex FunctionTool for agents.
transcribe_audio_tool = FunctionTool.from_defaults(
    name="transcribe_audio",
    description="Transcribes speech from an audio file to text using OpenAI's Whisper model. Provide the full path to the audio file.",
    fn=whisper_transcriber.transcribe
)
|
|
|
|
def encode_image_to_base64(file_path: str) -> str:
    """
    Read an image file and return its contents as a base64 encoded string.

    Only encoding happens here; the image is never decoded or modified.

    Args:
        file_path (str): Path to the image file to be encoded

    Returns:
        str: The base64 encoded string of the image

    Raises:
        FileNotFoundError: If the specified file doesn't exist
        ValueError: If the file has an unsupported extension

    Examples:
        >>> base64_data = encode_image_to_base64("data/photo.jpg")
    """
    supported_formats = ['.png', '.jpg', '.jpeg', '.gif', '.bmp', '.tiff', '.webp']

    # Check existence first so a missing file reports FileNotFoundError
    # rather than an extension error.
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"File not found at {file_path}")

    extension = os.path.splitext(file_path)[1].lower()
    if extension not in supported_formats:
        raise ValueError(f"Unsupported file extension: {extension}. Supported extensions are: {', '.join(supported_formats)}")

    # Read raw bytes, base64-encode, and decode to a plain UTF-8 string.
    with open(file_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode('utf-8')
| |
| |
# Expose base64 encoding as a tool so agents can prepare images for vision calls.
encode_image_tool = FunctionTool.from_defaults(
    name="encode_image_to_base64",
    description="Reads an image file and converts it to a base64 encoded string. Use this tool to prepare images for vision analysis.",
    fn=encode_image_to_base64
)
|
|
class VisionAnalyzerAgent:
    """
    A specialized agent for analyzing images using vision models.

    This agent can process images, analyze their content, and provide detailed descriptions
    or answer questions about the visual elements.
    """

    # Base64 prefixes produced by common image magic numbers:
    # JPEG (\xff\xd8\xff) -> "/9j/", PNG (\x89PNG\r\n\x1a\n) -> "iVBORw0KGgo",
    # GIF ("GIF8") -> "R0lGOD", RIFF containers e.g. WebP ("RIFF") -> "UklGR".
    _BASE64_SIGNATURES = (
        ("/9j/", "image/jpeg"),
        ("iVBORw0KGgo", "image/png"),
        ("R0lGOD", "image/gif"),
        ("UklGR", "image/webp"),
    )

    def __init__(
        self,
        model_provider: str = "openai",
        model_name: str = "gpt-4o",
        api_key: Optional[str] = None,
        **kwargs
    ):
        """
        Initialize a VisionAnalyzerAgent.

        Args:
            model_provider: The LLM provider to use ("anthropic" or "openai")
            model_name: The specific model name to use
            api_key: API key for the provider (defaults to environment variable)
            **kwargs: Additional parameters for the model (currently unused)

        Raises:
            ValueError: If model_provider is not "anthropic" or "openai"
        """
        self.model_provider = model_provider.lower()
        self.model_name = model_name
        self.api_key = api_key

        # NOTE(review): Anthropic/OpenAI here come from the llama_index.llms
        # wrappers, but analyze_image() uses raw-SDK call shapes
        # (`messages.create`, `chat.completions.create`) — verify these should
        # not be the native `anthropic`/`openai` SDK clients instead.
        if self.model_provider == "anthropic":
            self.client = Anthropic(api_key=api_key or os.getenv("ANTHROPIC_API_KEY"))
        elif self.model_provider == "openai":
            self.client = OpenAI(api_key=api_key or os.getenv("OPENAI_API_KEY"))
        else:
            raise ValueError(f"Unsupported model provider: {model_provider}. "
                             f"Supported providers are: anthropic, openai")

    @staticmethod
    def _detect_media_type(image_base64: str) -> str:
        """
        Infer the image MIME type from the base64 payload's leading characters.

        Falls back to "image/jpeg" when no known signature matches (preserves
        the previous default).
        """
        for prefix, media_type in VisionAnalyzerAgent._BASE64_SIGNATURES:
            if image_base64.startswith(prefix):
                return media_type
        return "image/jpeg"

    def analyze_image(self, image_base64: str, query: str = "Describe this image in detail.") -> str:
        """
        Analyze an image using the vision model.

        Args:
            image_base64: Base64 encoded image data
            query: The question or instruction for image analysis

        Returns:
            str: The analysis result from the vision model, or an error
                message string if the provider call fails (never raises).
        """
        # Both providers need the real media type; previously the OpenAI
        # branch always claimed image/jpeg even for PNG payloads.
        mime_type = self._detect_media_type(image_base64)

        if self.model_provider == "anthropic":
            try:
                response = self.client.messages.create(
                    model=self.model_name,
                    max_tokens=1024,
                    messages=[
                        {
                            "role": "user",
                            "content": [
                                {
                                    "type": "text",
                                    "text": query
                                },
                                {
                                    "type": "image",
                                    "source": {
                                        "type": "base64",
                                        "media_type": mime_type,
                                        "data": image_base64
                                    }
                                }
                            ]
                        }
                    ]
                )
                return response.content[0].text

            except Exception as e:
                return f"Error analyzing image with Anthropic: {str(e)}"

        elif self.model_provider == "openai":
            try:
                response = self.client.chat.completions.create(
                    model=self.model_name,
                    # Was 1024*20 (20480), which exceeds gpt-4o's output-token
                    # cap and makes the API reject the request outright.
                    max_tokens=4096,
                    messages=[
                        {
                            "role": "user",
                            "content": [
                                {
                                    "type": "text",
                                    "text": query
                                },
                                {
                                    "type": "image_url",
                                    "image_url": {
                                        "url": f"data:{mime_type};base64,{image_base64}"
                                    }
                                }
                            ]
                        }
                    ]
                )
                return response.choices[0].message.content

            except Exception as e:
                return f"Error analyzing image with OpenAI: {str(e)}"

        else:
            # Unreachable after __init__ validation; kept as a defensive guard.
            return "Unsupported model provider"
|
|
| |
def analyze_image_with_vision(image_path: str, query: str = "Describe this image in detail.") -> str:
    """
    Analyze an image using a vision-enabled model.

    Args:
        image_path: Path to the image file
        query: The question or instruction for image analysis

    Returns:
        str: The analysis result from the vision model, or an error message
            string if encoding or analysis fails (never raises).
    """
    try:
        # Encode the file, then hand the payload to a default vision agent.
        encoded = encode_image_to_base64(image_path)
        agent = VisionAnalyzerAgent()
        return agent.analyze_image(encoded, query)
    except Exception as e:
        return f"Error analyzing image: {str(e)}"
|
|
| |
# Combined encode-and-analyze pipeline exposed as a single agent tool.
vision_analyzer_tool = FunctionTool.from_defaults(
    name="analyze_image_with_vision",
    description="Analyzes images using a vision-enabled model. Provide the image path and an optional query/instruction.",
    fn=analyze_image_with_vision
)