Spaces:
Sleeping
Sleeping
| from langchain_community.document_loaders import TextLoader, PyPDFLoader | |
| from langchain_community.document_loaders import BSHTMLLoader, UnstructuredMarkdownLoader | |
| from langchain.text_splitter import RecursiveCharacterTextSplitter | |
| from langchain_community.vectorstores import FAISS, VectorStore | |
| from langchain_huggingface import HuggingFaceEmbeddings | |
| from langchain_core.documents import Document | |
| from pypdf.errors import PyPdfError | |
| # stdlib | |
| from glob import glob | |
| import pathlib | |
def load_text(file_path: str) -> list[Document] | None:
    """Load a plain-text (.txt) file into a list of documents.

    The file is read synchronously through ``TextLoader``.

    Args:
        file_path: Path of the file to load. Must be non-empty and carry
            the ``.txt`` suffix.

    Returns:
        The documents produced by the loader, or ``None`` when loading
        failed with a ``UnicodeError`` or ``RuntimeError`` (the failure is
        reported on stdout).

    Raises:
        AssertionError: If ``file_path`` is empty or does not end in
            ``.txt``.
    """
    assert file_path != ""
    assert pathlib.Path(file_path).suffix == ".txt"

    try:
        return TextLoader(file_path).load()
    # The exception types must be given as a tuple: `except A or B`
    # evaluates the `or` first and therefore only ever catches `A`.
    except (UnicodeError, RuntimeError) as err:
        print(f"could not load file: {file_path}")
        print(f"error: {err}")
        return None
# https://python.langchain.com/docs/how_to/document_loader_markdown/
def load_markdown(file_path: str) -> list[Document] | None:
    """Load a markdown (.md) file into a list of documents.

    The file is read synchronously through ``UnstructuredMarkdownLoader``.

    Args:
        file_path: Path of the file to load. Must be non-empty and carry
            the ``.md`` suffix.

    Returns:
        The documents produced by the loader, or ``None`` when loading
        failed with a ``UnicodeError`` or ``RuntimeError`` (the failure is
        reported on stdout).

    Raises:
        AssertionError: If ``file_path`` is empty or does not end in
            ``.md``.
    """
    assert file_path != ""
    assert pathlib.Path(file_path).suffix == ".md"

    try:
        # "elements" mode keeps metadata about what each piece of content
        # is, e.g. a paragraph, a link or a heading.
        loader = UnstructuredMarkdownLoader(file_path, mode="elements")
        return loader.load()
    # The exception types must be given as a tuple: `except A or B`
    # evaluates the `or` first and therefore only ever catches `A`.
    except (UnicodeError, RuntimeError) as err:
        print(f"could not load file: {file_path}")
        print(f"error: {err}")
        return None
# https://python.langchain.com/docs/how_to/document_loader_pdf/
def load_pdf(file_path: str) -> list[Document] | None:
    """Load a PDF (.pdf) file into a list of documents.

    The file is read synchronously through ``PyPDFLoader``.

    Args:
        file_path: Path of the file to load. Must be non-empty and carry
            the ``.pdf`` suffix.

    Returns:
        The documents produced by the loader, or ``None`` when reading
        failed with a ``PyPdfError`` (the failure is reported on stdout).

    Raises:
        AssertionError: If ``file_path`` is empty or does not end in
            ``.pdf``.
    """
    assert file_path != ""
    assert pathlib.Path(file_path).suffix == ".pdf"

    # The loader is constructed outside the try block, so only errors
    # raised while reading the file are handled below.
    pdf_loader = PyPDFLoader(file_path)
    try:
        documents = pdf_loader.load()
    except PyPdfError as exc:
        print(f"could not read file: {file_path}")
        print(f"error: {exc}")
        return None
    return documents
def load_html(file_path: str) -> list[Document]:
    """Load an HTML (.html / .htm) file into a list of documents.

    The file is read synchronously through ``BSHTMLLoader``. Unlike the
    other loaders in this module, errors raised by the loader are not
    caught here and propagate to the caller.

    Args:
        file_path: Path of the file to load. Must be non-empty and carry
            the ``.html`` or ``.htm`` suffix.

    Returns:
        The documents produced by the loader.

    Raises:
        AssertionError: If ``file_path`` is empty or has a suffix other
            than ``.html`` / ``.htm``.
    """
    assert file_path != ""
    # Membership test on purpose: `suffix == ".html" or ".htm"` is always
    # truthy (the non-empty string ".htm"), so it never rejected anything.
    assert pathlib.Path(file_path).suffix in (".html", ".htm")

    loader = BSHTMLLoader(file_path)
    return loader.load()
# Dispatch table from file suffix to the loader function that handles it,
# giving O(1) lookup of the right loader for a path.
LOADER_MAP = {
    ".pdf": load_pdf,
    # both HTML suffixes share a single loader
    **dict.fromkeys((".html", ".htm"), load_html),
    ".txt": load_text,
    ".md": load_markdown,
}
# https://python.langchain.com/v0.1/docs/modules/data_connection/retrievers/vectorstore/
def get_document_database(
    data_folder: str = "learning_material/*/*/*",
    embedding_model: str = "BAAI/bge-base-en-v1.5",
    chunk_size: int = 1028,
    chunk_overlap: int = 0,
) -> VectorStore:
    """Build a FAISS vector store from the files matched by a glob pattern.

    Every matched path is loaded with the loader registered for its suffix
    in ``LOADER_MAP``. The loaded documents are split into chunks with
    ``RecursiveCharacterTextSplitter`` and embedded with
    ``HuggingFaceEmbeddings``. Directories, files with an unsupported
    suffix and files that yield no documents are skipped, each with a
    message on stdout.

    Args:
        data_folder: Glob pattern (passed to ``glob``) selecting the files
            to index.
        embedding_model: Model name handed to ``HuggingFaceEmbeddings``.
        chunk_size: ``chunk_size`` passed to the text splitter.
        chunk_overlap: ``chunk_overlap`` passed to the text splitter.

    Returns:
        The vector store created by ``FAISS.from_documents`` from the
        chunked documents.
    """
    all_docs: list[Document] = []

    # glob() makes no ordering promise; sorting keeps the order in which
    # documents enter the index reproducible between runs.
    for file_path in sorted(glob(data_folder)):
        path = pathlib.Path(file_path)

        # Ask the filesystem rather than inferring "folder" from a missing
        # suffix: extension-less files are not folders, and folder names
        # may themselves contain a dot.
        if path.is_dir():
            print(f"{file_path} is a folder, skipping")
            continue

        extension = path.suffix
        load_fn = LOADER_MAP.get(extension)
        if not load_fn:
            print(f"no document loader for file extension '{extension}'")
            print(f"file {file_path} will be skipped")
            continue

        # load the document with a filetype specific loader; loaders may
        # return None (load failure) or an empty list
        result_documents = load_fn(file_path)
        if not result_documents:
            print(f"file {file_path} does not include any content, skipping")
            continue

        all_docs.extend(result_documents)

    if not all_docs:
        # NOTE(review): building the index from zero chunks is likely to
        # fail inside FAISS.from_documents - confirm and decide whether an
        # explicit error should be raised here instead.
        print(f"no documents could be loaded from '{data_folder}'")

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    chunked_docs = splitter.split_documents(all_docs)

    return FAISS.from_documents(
        chunked_docs,
        HuggingFaceEmbeddings(model_name=embedding_model),
    )