Spaces:

pratham0011
/

Scam-Shield-Agent

Sleeping

App Files Files Community

Scam-Shield-Agent / rag.py

pratham0011

Upload 4 files

7f4abc9 verified 11 months ago

raw

history blame contribute delete

2.11 kB

	### We first load a knowledge base on which we want to perform RAG

	import datasets
	from langchain.docstore.document import Document
	from langchain.text_splitter import RecursiveCharacterTextSplitter
	from langchain_community.retrievers import BM25Retriever
	from huggingface_hub import login
	import os

	# Load the "train" split of the SuccessfulCrab/web_content dataset.
	knowledge_base = datasets.load_dataset("SuccessfulCrab/web_content", split="train")

	# Wrap the "text" field of each record in a LangChain Document.
	source_docs = [Document(page_content=row["text"]) for row in knowledge_base]

	# Split the documents into chunks of at most 500 characters with a
	# 50-character overlap. Separators are tried in the order listed, from
	# paragraph breaks down to individual characters.
	text_splitter = RecursiveCharacterTextSplitter(
	    separators=["\n\n", "\n", ".", " ", ""],
	    chunk_size=500,
	    chunk_overlap=50,
	    strip_whitespace=True,
	    add_start_index=True,  # keep each chunk's start offset in its metadata
	)
	docs_processed = text_splitter.split_documents(source_docs)

	### Since we need to attach the retriever to the tool as an attribute, we cannot use the simple tool constructor with a @tool decorator.
	### Instead we follow the advanced setup highlighted in the tools tutorial and subclass Tool.

	from smolagents import Tool

	class RetrieverTool(Tool):
	    """Tool that returns the passages BM25 ranks highest for a query.

	    The BM25 retriever is built once in ``__init__`` from the documents
	    passed in and is reused by every ``forward`` call.
	    """

	    name = "retriever"
	    # The retriever below is BM25 (lexical ranking), and the corpus is
	    # whatever documents the tool is constructed with, so the description
	    # must not promise semantic search or a specific documentation set.
	    description = "Uses BM25 keyword search to retrieve the parts of the knowledge base that could be most relevant to answer your query."
	    inputs = {
	        "query": {
	            "type": "string",
	            "description": "The query to perform. This should be semantically close to your target documents. Use the affirmative form rather than a question.",
	        }
	    }
	    output_type = "string"

	    def __init__(self, docs, **kwargs):
	        """Build the BM25 retriever over ``docs``.

	        Args:
	            docs: Documents to index; passed to
	                ``BM25Retriever.from_documents``.
	            **kwargs: Forwarded unchanged to ``Tool.__init__``.
	        """
	        super().__init__(**kwargs)
	        # k=10: up to ten documents are returned per query.
	        self.retriever = BM25Retriever.from_documents(docs, k=10)

	    def forward(self, query: str) -> str:
	        """Return the retrieved passages for ``query`` as one string.

	        Args:
	            query: The search query text.

	        Returns:
	            A "Retrieved documents:" header followed by one numbered
	            section per retrieved document (numbering starts at 0).

	        Raises:
	            AssertionError: If ``query`` is not a string.
	        """
	        assert isinstance(query, str), "Your search query must be a string"

	        docs = self.retriever.invoke(query)
	        return "\nRetrieved documents:\n" + "".join(
	            f"\n\n===== Document {i} =====\n" + doc.page_content
	            for i, doc in enumerate(docs)
	        )

	# Module-level tool instance built over the chunked knowledge base.
	retriever_tool = RetrieverTool(docs=docs_processed)