Spaces:
Sleeping
Sleeping
| from tqdm import tqdm | |
| from tqdm.notebook import tqdm as tqdmk | |
| from langchain.document_loaders import PyPDFLoader | |
| from langchain.text_splitter import RecursiveCharacterTextSplitter | |
| from langchain.llms import HuggingFaceHub | |
| from langchain import PromptTemplate | |
| from langchain.document_loaders import PyPDFLoader | |
| import pandas as pd | |
| import duckdb | |
| import numpy as np | |
| import os | |
class RAG_Retrival:
    """Retrieval pipeline for a RAG system.

    Reads .txt/.pdf documents from a directory tree, splits them into
    overlapping chunks, embeds each chunk, stores the result through a
    database wrapper, and answers similarity queries against it.

    NOTE(review): the class name keeps the original (misspelled)
    identifier 'RAG_Retrival' so existing callers are unaffected.
    """

    def __init__(self, db, model, embedder):
        """Store the collaborators this pipeline delegates to.

        db       -- database wrapper; must expose make_data_frame(df, name),
                    close(), and get_relevant_docs(vector) (project type).
        model    -- LLM handle; stored but not used inside this class.
        embedder -- object exposing embed(text) -> embedding vector.
        """
        self.conn = db
        self.model = model
        self.embedder = embedder

    def read_data(self, path_data):
        """Recursively read every .txt and .pdf under `path_data`.

        Returns one string with all extracted text concatenated in
        os.walk order. Files with other extensions are skipped but still
        advance the progress bar.
        """
        # Count files up front so tqdm can display an accurate total.
        total_files = sum(len(files) for _, _, files in os.walk(path_data))
        # Collect pieces and join once at the end -- the original
        # `all_text += ...` pattern is quadratic on large corpora.
        parts = []
        with tqdm(total=total_files, desc="Reading files", unit="file") as pbar:
            for root, _dirs, files in os.walk(path_data):
                for file in files:
                    full_path = os.path.join(root, file)
                    if full_path.endswith(".txt"):
                        parts.append(self.load_text_file(full_path))
                    elif full_path.endswith(".pdf"):
                        parts.append(self.load_pdf(full_path))
                    pbar.update(1)
        return "".join(parts)

    def load_text_file(self, path):
        """Return the full contents of a text file as one string.

        Reads in a single call instead of line-by-line `+=` concatenation.
        Encoding is explicit (UTF-8) with errors='replace' so a stray
        undecodable byte degrades to U+FFFD instead of aborting the
        whole ingestion run.
        """
        with open(path, "r", encoding="utf-8", errors="replace") as file:
            return file.read()

    def load_pdf(self, pdf_folder):
        """Extract text from a single PDF, pages joined by newlines.

        `pdf_folder` is actually the path of one PDF file (name kept for
        interface compatibility with existing callers).
        """
        loader = PyPDFLoader(pdf_folder)
        pages = loader.load_and_split()
        return "\n".join(doc.page_content for doc in pages)

    def text_splitter(self, text, chunk_size=1000, chunk_overlap=100, is_separator_regex=False):
        """Split `text` into overlapping chunks; tag each chunk with a doc_id.

        Returns a list of langchain Documents with metadata {'doc_id': i}.
        """
        # Local renamed from `text_splitter`, which shadowed this method's
        # own name.
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=len,
            is_separator_regex=is_separator_regex,
        )
        docs = splitter.create_documents([text])
        for i, d in enumerate(docs):
            d.metadata = {"doc_id": i}
        return docs

    def prepare_text_df(self, docs):
        """Embed each chunk and return a DataFrame of content + embeddings.

        Columns: 'page_content' (str) and 'embeddings' (whatever
        self.embedder.embed returns, one entry per chunk).
        """
        content_list = [doc.page_content for doc in docs]
        print("Making embedding...")
        # One embedder call per chunk; tqdmk renders a notebook progress bar.
        embeddings = [self.embedder.embed(content) for content in tqdmk(content_list)]
        print("Finished embedding...")
        return pd.DataFrame({
            'page_content': content_list,
            'embeddings': embeddings,
        })

    def make_data_frame(self, path, chunk_size=1000, chunk_overlap=100, is_separator_regex=False):
        """End-to-end ingestion: read -> chunk -> embed -> upload.

        Returns the embedding DataFrame after persisting it via
        upload_file (which also closes the DB connection -- see note there).
        """
        all_texts = self.read_data(path)
        docs = self.text_splitter(all_texts, chunk_size, chunk_overlap, is_separator_regex)
        dataframe = self.prepare_text_df(docs)
        self.upload_file(dataframe)
        return dataframe

    def upload_file(self, embedding_df, name='first_aid'):
        """Upload the embedding DataFrame and close the DB so it commits.

        NOTE(review): closing self.conn here means query_relevant cannot
        be called on the same instance afterwards -- confirm this
        one-shot lifecycle is intended.
        """
        self.conn.make_data_frame(embedding_df, name)
        self.conn.close()

    def query_relevant(self, user_query):
        """Embed `user_query` and return the relevant documents from the DB."""
        embedded_query = self.embedder.embed(user_query)
        return self.conn.get_relevant_docs(embedded_query)