Spaces:
Sleeping
Sleeping
| from langchain_community.document_loaders import PagedPDFSplitter | |
| from langchain.text_splitter import RecursiveCharacterTextSplitter | |
| from langchain.embeddings.openai import OpenAIEmbeddings | |
| from langchain_community.vectorstores import FAISS | |
| import os | |
def embed_document(file_name, file_folder="pdf", embedding_folder="index"):
    """Build a FAISS index for one PDF and save it to disk.

    The PDF at ``{file_folder}/{file_name}`` is loaded through the loader's
    ``load_and_split()``, re-chunked into pieces of at most 500 characters
    (100 characters of overlap, measured with ``len``), embedded with
    ``OpenAIEmbeddings`` and stored with ``FAISS.save_local``.

    Args:
        file_name: Name of the PDF inside ``file_folder``. It is also used,
            with ``".index"`` appended, as the saved index name.
        file_folder: Folder containing the PDF.
        embedding_folder: Folder passed to ``save_local`` as ``folder_path``.
    """
    pdf_loader = PagedPDFSplitter(f"{file_folder}/{file_name}")
    pages = pdf_loader.load_and_split()

    embeddings = OpenAIEmbeddings()

    # Separators are tried in order: paragraph, line, word, then character.
    chunker = RecursiveCharacterTextSplitter(
        separators=["\n\n", "\n", " ", ""],
        is_separator_regex=False,
        length_function=len,
        chunk_overlap=100,
        chunk_size=500,
    )
    chunks = chunker.split_documents(pages)

    index_name = file_name + ".index"
    vector_store = FAISS.from_documents(chunks, embeddings)
    vector_store.save_local(folder_path=embedding_folder, index_name=index_name)
def embed_all_inputed_pdf_docs(uploaded_document):
    """Save an uploaded file into the ``pdf`` directory, then embed every PDF there.

    Args:
        uploaded_document: Upload object exposing a ``name`` attribute and a
            ``getbuffer()`` method returning the file's bytes (presumably a
            Streamlit ``UploadedFile`` -- confirm against the caller).

    Raises:
        ValueError: If the upload's name has no file-name component.
        Exception: If, after the upload has been written, the directory
            contains no file ending in ``.pdf``.
    """
    pdf_directory = "pdf"

    # Create the directory up front. Previously the file was written before
    # the existence check, so a missing directory surfaced as a
    # FileNotFoundError from open() and the check could never trigger.
    os.makedirs(pdf_directory, exist_ok=True)

    # The upload name is client-supplied: keep only its final path component
    # so a name such as "../x.pdf" cannot be written outside pdf_directory.
    safe_name = os.path.basename(uploaded_document.name)
    if not safe_name:
        raise ValueError("Uploaded document has no file name.")

    pdf_file_path = os.path.join(pdf_directory, safe_name)
    with open(pdf_file_path, "wb") as file:
        file.write(uploaded_document.getbuffer())

    # Every PDF in the directory is (re-)embedded, not only the new upload.
    pdf_files = [
        file for file in os.listdir(pdf_directory) if file.endswith(".pdf")
    ]
    if not pdf_files:
        raise Exception("No PDF files found in the directory.")

    for pdf_file in pdf_files:
        print(f"Embedding {pdf_file}...")
        embed_document(file_name=pdf_file, file_folder=pdf_directory)
        print("Done!")
def embed_all_pdf_docs():
    """Embed every ``.pdf`` file found in the ``pdf`` directory.

    Each matching file name is announced on stdout and handed to
    ``embed_document``.

    Raises:
        Exception: If the ``pdf`` directory does not exist, or if it holds
            no file whose name ends in ``.pdf``.
    """
    source_dir = "pdf"

    # Fail fast when there is nothing to scan.
    if not os.path.exists(source_dir):
        raise Exception(f"Directory '{source_dir}' does not exist.")

    documents = [
        entry for entry in os.listdir(source_dir) if entry.endswith(".pdf")
    ]
    if not documents:
        raise Exception("No PDF files found in the directory.")

    for document in documents:
        print(f"Embedding {document}...")
        embed_document(file_name=document, file_folder=source_dir)
        print("Done!")
def get_all_index_files():
    """Return the names of the indexes stored in the ``index`` directory.

    A file counts as an index when its name ends in ``.index.faiss``; the
    returned name is the file name with that trailing suffix removed.

    Returns:
        list[str]: Index names, in the order ``os.listdir`` yields them.

    Raises:
        Exception: If the ``index`` directory does not exist, or if it holds
            no file ending in ``.index.faiss``.
    """
    index_directory = "index"
    if not os.path.exists(index_directory):
        raise Exception(f"Directory '{index_directory}' does not exist.")

    postfix = ".index.faiss"
    # Slice off only the trailing suffix. str.replace() would also delete
    # any earlier ".index.faiss" inside the name and corrupt it.
    index_files = [
        file[: -len(postfix)]
        for file in os.listdir(index_directory)
        if file.endswith(postfix)
    ]
    if not index_files:
        raise Exception("No index files found in the directory.")
    return index_files