TaxReturn-AgenticRAG / rag_smolagent.py

Upload folder using huggingface_hub

ea8a378 verified 11 months ago

5.98 kB

	import os
	import constants as cte

	from dotenv import load_dotenv
	from reportlab.lib.pagesizes import letter
	from reportlab.lib.styles import getSampleStyleSheet
	from reportlab.lib.units import inch
	from reportlab.platypus import Paragraph, SimpleDocTemplate, Spacer
	from smolagents import CodeAgent, LiteLLMModel, Tool, MessageRole

	load_dotenv()  # Load variables from a local .env file into the environment.

	# Azure OpenAI chat-model settings; each is None when its variable is unset.
	LLM_BASE = os.getenv("AZURE_OPENAI_BASE")
	LLM_VERSION = os.getenv("AZURE_OPENAI_VERSION")
	LLM_API_KEY = os.getenv("AZURE_OPENAI_API_KEY")
	# Original (misspelled) name, kept as an alias so existing references work.
	LLMI_API_KEY = LLM_API_KEY
	LLM_NAME = os.getenv("AZURE_OPENAI_MODEL")

	# Azure OpenAI embedding-model settings; each is None when unset.
	EMBEDDING_BASE = os.getenv("AZURE_OPENAI_EMBEDDING_BASE")
	EMBEDDING_VERSION = os.getenv("AZURE_OPENAI_EMBEDDING_VERSION")
	EMBEDDING_API_KEY = os.getenv("AZURE_OPENAI_EMBEDDING_API_KEY")
	EMBEDDING_NAME = os.getenv("AZURE_OPENAI_EMBEDDING_MODEL")

	# Directory of the persisted Chroma database. The default is the original
	# hard-coded location; set the CHROMA_PATH environment variable to point at
	# a database stored elsewhere.
	CHROMA_PATH = os.getenv(
	    "CHROMA_PATH", "/teamspace/studios/this_studio/AgenticRAG/chroma_db"
	)
	# NOTE(review): not referenced anywhere in the visible code; the value looks
	# like a Hugging Face dataset id rather than a path — confirm it is needed.
	BM25_PATH = "philschmid/markdown-documentation-transformers"

	class ChromaRetrieverTool(Tool):
	    """Agent tool that runs a vector search against a persistent Chroma collection.

	    The tool queries the ``RENTA_2023_LARGE`` collection, embedding the query
	    with an Azure OpenAI embedding function configured from the module-level
	    ``EMBEDDING_*`` settings, and returns the matching chunks as one string.
	    """

	    name = "chroma_retriever"
	    description = """Uses vector search to retrieve chunks of information from the “Manual de la Renta” document that might be more relevant to answering your query.
	Use the affirmative form rather than a question. If the age or residence is provided, be sure to include it in query to find the necessary information.
	For better results, searches must be for one specific data, for multiple concepts or different information, use multiple calls to the tool"""
	    inputs = {
	        "query": {
	            "type": "string",
	            "description": "The query to perform. This should be vector space close to your target documents.",
	        }
	    }
	    output_type = "string"

	    def __init__(self, path_to_database, top_k_results: int = 5, **kwargs):
	        """Open the Chroma database and bind the collection to query.

	        Args:
	            path_to_database: Directory passed to ``chromadb.PersistentClient``.
	            top_k_results: Number of results requested per query.
	            **kwargs: Forwarded unchanged to ``Tool.__init__``.
	        """
	        super().__init__(**kwargs)

	        # Local imports keep chromadb out of module import time. The
	        # submodule is imported explicitly instead of being reached through
	        # attribute access on the top-level package, which only works when
	        # something else has already imported it.
	        import chromadb
	        from chromadb.utils import embedding_functions

	        self.top_k_results = top_k_results
	        self.openai_embedding = embedding_functions.OpenAIEmbeddingFunction(
	            api_key=EMBEDDING_API_KEY,
	            api_base=EMBEDDING_BASE,
	            api_type="azure",
	            api_version=EMBEDDING_VERSION,
	            # Keep only the part after the last "/" of the configured name.
	            model_name=str(EMBEDDING_NAME).split("/")[-1],
	        )
	        self.retriever_client = chromadb.PersistentClient(path=path_to_database)
	        self.collection = self.retriever_client.get_or_create_collection(
	            name="RENTA_2023_LARGE",
	            embedding_function=self.openai_embedding,
	        )

	    def forward(self, query: str) -> str:
	        """Return the top matching chunks for ``query`` as formatted text.

	        Args:
	            query: Text to search for.

	        Returns:
	            A header followed by one ``===== Document <metadata> =====``
	            section per retrieved chunk, or ``"No information found"`` when
	            the search yields no chunks.

	        Raises:
	            TypeError: If ``query`` is not a string.
	        """
	        # Explicit check instead of ``assert`` so it survives ``python -O``.
	        if not isinstance(query, str):
	            raise TypeError("Your search query must be a string")

	        results = self.collection.query(
	            query_texts=[query],
	            n_results=self.top_k_results,  # how many results to return
	        )

	        # Both entries are lists with one inner list per query text; either
	        # may be missing or None, so normalise them to empty lists.
	        documents = results.get("documents") or []
	        metadatas = results.get("metadatas") or []

	        sections = []
	        for j, batch in enumerate(documents):
	            batch_metadata = metadatas[j] if j < len(metadatas) else None
	            for i, doc in enumerate(batch or []):
	                has_metadata = batch_metadata is not None and i < len(batch_metadata)
	                metadata = batch_metadata[i] if has_metadata else None
	                sections.append(f"\n\n===== Document {metadata} =====\n{doc!s}")

	        # An empty result set used to produce a bare header; report it
	        # explicitly so the agent knows nothing was found.
	        if not sections:
	            return "No information found"

	        return "\nTop Retrieved documents:\n" + "".join(sections)

	class GeneratePDFTool(Tool):
	    """Agent tool that writes a text answer to ``final_answer.pdf``."""

	    name = "generate_pdf"
	    description = "Generates a PDF document from the final answer."
	    inputs = {
	        "text": {
	            "type": "string",
	            "description": "The final answer to be included too in the PDF document.",
	        }
	    }
	    output_type = "string"

	    def forward(self, text: str) -> str:
	        """Render ``text`` into a letter-sized PDF in the working directory.

	        Args:
	            text: Plain-text content of the document.

	        Returns:
	            A success message, or ``"Error generating PDF: ..."`` describing
	            the failure. Errors are returned rather than raised so the
	            calling agent receives them as the tool's output.
	        """
	        from xml.sax.saxutils import escape

	        try:
	            # Paragraph parses its input as XML-like markup and collapses
	            # whitespace, so escape "&", "<" and ">" and turn line breaks
	            # into explicit <br/> tags to render the text literally.
	            markup = "<br/>".join(escape(text).splitlines())

	            doc = SimpleDocTemplate("final_answer.pdf", pagesize=letter)
	            styles = getSampleStyleSheet()
	            story = [
	                Paragraph(markup, styles["Normal"]),
	                Spacer(1, 0.2 * inch),  # Add a space below the content
	            ]
	            doc.build(story)
	        except Exception as e:
	            # Deliberately broad: any failure is reported back as text.
	            return f"Error generating PDF: {str(e)}"

	        return "PDF document 'final_answer.pdf' has been generated successfully."


	class SmolAgent:
	    """Wrap a ``CodeAgent`` equipped with the retriever and PDF tools.

	    The language model is configured from the ``AZURE_OPENAI_MODEL``,
	    ``AZURE_OPENAI_BASE`` and ``AZURE_OPENAI_API_KEY`` environment variables.
	    """

	    # Environment variables that must be set before the agent can be built.
	    _REQUIRED_ENV_VARS = (
	        "AZURE_OPENAI_MODEL",
	        "AZURE_OPENAI_BASE",
	        "AZURE_OPENAI_API_KEY",
	    )

	    def __init__(self):
	        """Build the model, the tools and the agent.

	        Raises:
	            ValueError: If any required environment variable is unset or
	                empty. Previously a missing variable was silently turned
	                into the literal string ``"None"``.
	        """
	        settings = {var: os.getenv(var) for var in self._REQUIRED_ENV_VARS}
	        missing = [var for var, value in settings.items() if not value]
	        if missing:
	            raise ValueError(
	                "Missing required environment variables: " + ", ".join(missing)
	            )

	        # Define LLM model to call.
	        model = LiteLLMModel(
	            model_id=settings["AZURE_OPENAI_MODEL"],
	            api_base=settings["AZURE_OPENAI_BASE"],
	            api_key=settings["AZURE_OPENAI_API_KEY"],
	            temperature=0,
	        )

	        # Create retriever with documents preprocessed.
	        chroma_tool = ChromaRetrieverTool(path_to_database=CHROMA_PATH)
	        generate_pdf_tool = GeneratePDFTool()

	        self.agent = CodeAgent(
	            tools=[chroma_tool, generate_pdf_tool],
	            model=model,
	            max_steps=8,
	            add_base_tools=True,
	            additional_authorized_imports=["chromadb"],
	        )

	    def __call__(self, query):
	        """Run the agent on ``query`` and return whatever ``agent.run`` returns.

	        ``reset=False`` is passed through to ``run``.
	        NOTE(review): presumably this keeps memory from earlier calls on the
	        same instance — confirm against the smolagents ``run`` documentation.
	        """
	        return self.agent.run(query, reset=False)


	if __name__ == "__main__":
	    # Demo run: ask the agent one sample question and print its answer.
	    rag_agent = SmolAgent()
	    divider = "======================================================================================================================================"

	    first_question = "¿Tengo que declarar los bízums recibidos a lo largo del año pasado?"
	    print(f"\nQuestion 1: {first_question}")
	    answer = rag_agent(first_question)

	    # Three divider lines separate the agent's own logging from the answer.
	    # Debugging aid: print(rag_agent.agent.write_memory_to_messages()) here
	    # to dump the agent's memory.
	    for _ in range(3):
	        print(divider)
	    print("Agent response:\n", answer)