# Scrape artifact — HuggingFace Spaces page header, not program text:
# "Spaces: Sleeping Sleeping"
import gradio as gr
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
| # Load and split PDF document | |
| def load_doc(list_file_path): | |
| # Processing for one document only | |
| # loader = PyPDFLoader(file_path) | |
| # pages = loader.load() | |
| loaders = [PyPDFLoader(x) for x in list_file_path] | |
| pages = [] | |
| for loader in loaders: | |
| pages.extend(loader.load()) | |
| text_splitter = RecursiveCharacterTextSplitter( | |
| chunk_size = 1024, | |
| chunk_overlap = 64 | |
| ) | |
| doc_splits = text_splitter.split_documents(pages) | |
| return doc_splits | |
def create_db(splits):
    """Build a FAISS vector store over *splits* using BGE-small embeddings (CPU)."""
    embeddings = HuggingFaceEmbeddings(
        model_name="BAAI/bge-small-en",
        model_kwargs={"device": "cpu"},
    )
    return FAISS.from_documents(splits, embeddings)
def initialize_database(list_file_obj, progress=gr.Progress()):
    """Create a FAISS vector database from a list of uploaded PDF file objects.

    ``progress`` is the gradio progress tracker (gradio injects it per call).
    Returns the vector database.
    """
    # Keep only files that were actually uploaded (gradio may pass None slots).
    valid_paths = [file.name for file in list_file_obj if file is not None]
    chunks = load_doc(valid_paths)
    return create_db(chunks)  # , "Database created!"