Spaces:
Sleeping
Sleeping
| from tqdm import tqdm | |
| from tqdm.notebook import tqdm as tqdmk | |
| from langchain.document_loaders import PyPDFLoader | |
| from langchain.text_splitter import RecursiveCharacterTextSplitter | |
| from langchain.llms import HuggingFaceHub | |
| from langchain import PromptTemplate | |
| from langchain.document_loaders import PyPDFLoader | |
| import pandas as pd | |
| import duckdb | |
| import numpy as np | |
| import os | |
class RAG_Retrival:
    """Retrieval pipeline for a RAG system.

    Reads .txt/.pdf documents from a directory tree, splits them into
    overlapping chunks, embeds each chunk, stores the result through a
    database wrapper, and answers similarity queries against it.

    NOTE(review): the class name keeps the original (misspelled)
    identifier 'RAG_Retrival' so existing callers are unaffected.
    """

    def __init__(self, db, model, embedder):
        """Store the collaborators this pipeline delegates to.

        db       -- database wrapper; must expose make_data_frame(df, name),
                    close(), and get_relevant_docs(vector) (project type).
        model    -- LLM handle; stored but not used inside this class.
        embedder -- object exposing embed(text) -> embedding vector.
        """
        self.conn = db
        self.model = model
        self.embedder = embedder

    def read_data(self, path_data):
        """Recursively read every .txt and .pdf under `path_data`.

        Returns one string with all extracted text concatenated in
        os.walk order. Files with other extensions are skipped but still
        advance the progress bar.
        """
        # Count files up front so tqdm can display an accurate total.
        total_files = sum(len(files) for _, _, files in os.walk(path_data))
        # Collect pieces and join once at the end -- the original
        # `all_text += ...` pattern is quadratic on large corpora.
        parts = []
        with tqdm(total=total_files, desc="Reading files", unit="file") as pbar:
            for root, _dirs, files in os.walk(path_data):
                for file in files:
                    full_path = os.path.join(root, file)
                    if full_path.endswith(".txt"):
                        parts.append(self.load_text_file(full_path))
                    elif full_path.endswith(".pdf"):
                        parts.append(self.load_pdf(full_path))
                    pbar.update(1)
        return "".join(parts)

    def load_text_file(self, path):
        """Return the full contents of a text file as one string.

        Reads in a single call instead of line-by-line `+=` concatenation.
        Encoding is explicit (UTF-8) with errors='replace' so a stray
        undecodable byte degrades to U+FFFD instead of aborting the
        whole ingestion run.
        """
        with open(path, "r", encoding="utf-8", errors="replace") as file:
            return file.read()

    def load_pdf(self, pdf_folder):
        """Extract text from a single PDF, pages joined by newlines.

        `pdf_folder` is actually the path of one PDF file (name kept for
        interface compatibility with existing callers).
        """
        loader = PyPDFLoader(pdf_folder)
        pages = loader.load_and_split()
        return "\n".join(doc.page_content for doc in pages)

    def text_splitter(self, text, chunk_size=1000, chunk_overlap=100, is_separator_regex=False):
        """Split `text` into overlapping chunks; tag each chunk with a doc_id.

        Returns a list of langchain Documents with metadata {'doc_id': i}.
        """
        # Local renamed from `text_splitter`, which shadowed this method's
        # own name.
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=len,
            is_separator_regex=is_separator_regex,
        )
        docs = splitter.create_documents([text])
        for i, d in enumerate(docs):
            d.metadata = {"doc_id": i}
        return docs

    def prepare_text_df(self, docs):
        """Embed each chunk and return a DataFrame of content + embeddings.

        Columns: 'page_content' (str) and 'embeddings' (whatever
        self.embedder.embed returns, one entry per chunk).
        """
        content_list = [doc.page_content for doc in docs]
        print("Making embedding...")
        # One embedder call per chunk; tqdmk renders a notebook progress bar.
        embeddings = [self.embedder.embed(content) for content in tqdmk(content_list)]
        print("Finished embedding...")
        return pd.DataFrame({
            'page_content': content_list,
            'embeddings': embeddings,
        })

    def make_data_frame(self, path, chunk_size=1000, chunk_overlap=100, is_separator_regex=False):
        """End-to-end ingestion: read -> chunk -> embed -> upload.

        Returns the embedding DataFrame after persisting it via
        upload_file (which also closes the DB connection -- see note there).
        """
        all_texts = self.read_data(path)
        docs = self.text_splitter(all_texts, chunk_size, chunk_overlap, is_separator_regex)
        dataframe = self.prepare_text_df(docs)
        self.upload_file(dataframe)
        return dataframe

    def upload_file(self, embedding_df, name='first_aid'):
        """Upload the embedding DataFrame and close the DB so it commits.

        NOTE(review): closing self.conn here means query_relevant cannot
        be called on the same instance afterwards -- confirm this
        one-shot lifecycle is intended.
        """
        self.conn.make_data_frame(embedding_df, name)
        self.conn.close()

    def query_relevant(self, user_query):
        """Embed `user_query` and return the relevant documents from the DB."""
        embedded_query = self.embedder.embed(user_query)
        return self.conn.get_relevant_docs(embedded_query)