Spaces:

Libidrave
/

LLM-RAG

Sleeping

App Files Files Community

Libidrave commited on Feb 19, 2025

Commit

c8d159b

verified ·

1 Parent(s): 1a2db48

Create utils/preprocess.py

Browse files

Files changed (1) hide show

utils/preprocess.py +78 -0

utils/preprocess.py ADDED Viewed

	@@ -0,0 +1,78 @@

+from langchain_community.document_loaders.generic import GenericLoader
+from langchain_community.document_loaders import FileSystemBlobLoader
+from langchain_community.document_loaders.parsers import PyMuPDFParser
+from langchain_text_splitters import RecursiveCharacterTextSplitter
+from langchain_chroma import Chroma
+from pathlib import Path
+def load_data(documents):
+    """
+    Load and parse data from a list of PDF files.
+    Args:
+    documents Union[UploadedFile, list(UploadedFile)]: A single UploadedFile or list of UploadedFile objects. Strict for PDF only.
+    Returns:
+    List[Document]: A list of parsed LangChain Document class.
+    """
+    # Write PDF file to current working directory
+    for file in documents:
+        with open(f"./{file.name}", 'wb') as f:
+            f.write(file.getbuffer())
+    # Load and parse the data
+    loader = GenericLoader(blob_loader=FileSystemBlobLoader(path="./", glob="*.pdf"),
+                           blob_parser=PyMuPDFParser(mode='page'))
+    loaded_docs = loader.load()
+    # Remove temporary PDF files after loading
+    pdf_files = Path.cwd().glob("*.pdf")
+    for pdf in pdf_files:
+        pdf.unlink()
+    return loaded_docs
+def split_data(loaded_docs):
+    """
+    Split a list of loaded documents into smaller chunks.
+    Args:
+    loaded_docs List[Document]: A list of loaded LangChain Document class.
+    Returns:
+    List[Document]: A list of smaller chunks of parsed document.
+    """
+    splitter = RecursiveCharacterTextSplitter(
+                        separators=["\n\n", "\n", " ", ".", ",", ""
+                                    "\u200b",  # Zero-width space
+                                    "\uff0c",  # Fullwidth comma
+                                    "\u3001",  # Ideographic comma
+                                    "\uff0e",  # Fullwidth full stop
+                                    "\u3002",  # Ideographic full stop
+                                    ],
+                        chunk_size=1000,
+                        chunk_overlap=200,
+                        add_start_index=True,
+                        is_separator_regex=False)
+    splitted_docs = splitter.split_documents(loaded_docs)
+    return splitted_docs
+def upsert_chromadb(splitted_docs, embedding, idx, collection_name, db_name):
+    """
+    Upserts data into Chromadb
+    Args:
+    splitted_docs List[Document]: A list of smaller chunks of parsed document.
+    embedding: The embedding model.
+    idx List[str]: A list of unique identifiers for each document.
+    collection_name str: The name of the Chroma collection.
+    db_name str: The name of the database.
+    """
+    vector_store = Chroma.from_documents(splitted_docs, embedding, ids=idx,
+                          collection_name=collection_name,
+                          persist_directory="./" + db_name
+                         )
+    return vector_store