### We first load a knowledge base on which we want to perform RAG
import datasets
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.retrievers import BM25Retriever

knowledge_base = datasets.load_dataset("SuccessfulCrab/web_content", split="train")

# Wrap each dataset row in a LangChain Document so it can be split and indexed
source_docs = [Document(page_content=doc["text"]) for doc in knowledge_base]

# Split the documents into overlapping, passage-sized chunks for retrieval
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50,
    add_start_index=True,
    strip_whitespace=True,
    separators=["\n\n", "\n", ".", " ", ""],
)
docs_processed = text_splitter.split_documents(source_docs)

### Since we need to attach the retriever as an attribute of the tool, we cannot simply use the @tool decorator constructor.
### Therefore we will follow the advanced setup highlighted in the tools tutorial.
from smolagents import Tool


class RetrieverTool(Tool):
    name = "retriever"
    description = (
        "Uses BM25 search to retrieve the parts of the knowledge base that could be "
        "most relevant to answer your query."
    )
    inputs = {
        "query": {
            "type": "string",
            "description": "The query to perform. This should be semantically close to your target documents. Use the affirmative form rather than a question.",
        }
    }
    output_type = "string"

    def __init__(self, docs, **kwargs):
        super().__init__(**kwargs)
        # Build a BM25 index over the processed chunks, returning the top 10 matches per query
        self.retriever = BM25Retriever.from_documents(docs, k=10)

    def forward(self, query: str) -> str:
        assert isinstance(query, str), "Your search query must be a string"

        docs = self.retriever.invoke(query)
        return "\nRetrieved documents:\n" + "".join(
            f"\n\n===== Document {i} =====\n" + doc.page_content
            for i, doc in enumerate(docs)
        )


retriever_tool = RetrieverTool(docs_processed)
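
### As a quick sanity check, the tool can be called directly before handing it to an agent.
### The lines below are a minimal sketch rather than part of the setup above: the example
### queries are placeholders, and the InferenceClientModel class (called HfApiModel in older
### smolagents releases) and the chosen model id are illustrative assumptions, not requirements.
print(retriever_tool(query="splitting long documents into chunks"))

from smolagents import CodeAgent, InferenceClientModel

# Hypothetical model choice; any smolagents-compatible model object works here
model = InferenceClientModel(model_id="Qwen/Qwen2.5-Coder-32B-Instruct")

# Give the agent access to the retriever so it can iterate: query -> read chunks -> answer
agent = CodeAgent(tools=[retriever_tool], model=model, max_steps=4)

# Placeholder question; replace it with one your knowledge base can actually answer
print(agent.run("How should long documents be split into chunks for retrieval?"))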