## use os.listdir to read all file names in the data folder (returns a list)
## run through each file name:
### ingest: load the file with PyMuPDFLoader, reading pages via alazy_load
### chunk: split the pages into chunks of 300 characters with an overlap of 50
## merge all chunks across documents into a single list
## proceed to the embedding phase
## save everything into a FAISS vector store
### Resources: https://stackabuse.com/python-async-await-tutorial/
import os
import asyncio
import itertools

from langchain_community.document_loaders import PyMuPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

FILE_PATH = "./data/"
VECTORSTORE = "./vectorstore/agriquery_faiss_index"

# initialise the recursive character splitter
splitter = RecursiveCharacterTextSplitter(
    chunk_size=300,
    chunk_overlap=50,
)

# ingest function. async so that several files can be loaded concurrently
async def ingest(file_name, path):
    """
    loads the content of a file using PyMuPDF
    input (str, str): file name and folder path
    output (list): file content divided by pages
    """
    pages = []
    loader = PyMuPDFLoader(os.path.join(path, file_name))
    async for page in loader.alazy_load():
        pages.append(page)
    return pages
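
# Note: a one-shot synchronous equivalent is loader.load(), which returns
# the same one-Document-per-page list without needing an event loop.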

# chunk function
def chunk(file_content):
    """
    chunks the content of a file using the LangChain recursive splitter
    input (list): file content divided by pages
    output (list): chunks with the configured size and overlap
    """
    chunks = []
    for page in file_content:
        docs = [Document(page_content=page.page_content)]
        texts = splitter.split_documents(docs)
        chunks.append(texts)
    return list(itertools.chain(*chunks))
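
# Note: splitter.split_documents already accepts a list of Documents, so the
# loop above could be replaced by the one-liner
#     return splitter.split_documents(file_content)
# which would also keep each page's source metadata on the chunks.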

# embed function: embed the chunks and persist them in a FAISS store
def embed(chunks):
    """
    embeds the chunks using a Hugging Face sentence transformer
    input (list): chunks
    output: FAISS index saved to disk at VECTORSTORE
    """
    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )
    db = FAISS.from_documents(chunks, embeddings)
    db.save_local(VECTORSTORE)
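
# A minimal sketch of reading the index back for querying. It assumes the
# same embedding model that built the index; the defaults are illustrative
# only, and the function is not called anywhere in this script.
def load_store(query="example query", k=3):
    """
    reloads the FAISS index from disk and runs a similarity search
    input (str, int): query text and number of results
    output (list): the k most similar chunks as Documents
    """
    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )
    # allow_dangerous_deserialization is needed because the docstore is pickled
    db = FAISS.load_local(
        VECTORSTORE, embeddings, allow_dangerous_deserialization=True
    )
    return db.similarity_search(query, k=k)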

# main function
async def main():
    """
    entry point: ingest, chunk, and embed every file in FILE_PATH
    """
    total_chunks = []
    files = os.listdir(FILE_PATH)
    # schedule one ingest task per file and load them concurrently
    tasks = [ingest(file_name, FILE_PATH) for file_name in files]
    page_list = await asyncio.gather(*tasks)
    for pages in page_list:
        # call the chunk function on each document's pages
        chunks = chunk(pages)
        total_chunks.append(chunks)
    # flatten the list of lists so it is suitable for embedding
    chunks = list(itertools.chain(*total_chunks))
    print(f"Total length of chunks is: {len(chunks)}")
    embed(chunks)
    print("Success!")


if __name__ == "__main__":
    asyncio.run(main())