|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import os |
|
|
import asyncio |
|
|
import itertools |
|
|
from langchain_community.document_loaders import PyMuPDFLoader |
|
|
from langchain_text_splitters.character import RecursiveCharacterTextSplitter |
|
|
from langchain_core.documents import Document |
|
|
from langchain_community.embeddings import HuggingFaceBgeEmbeddings |
|
|
from langchain_community.vectorstores import FAISS |
|
|
from langchain.docstore.document import Document |
|
|
|
|
|
|
|
|
# Directory containing the source documents to ingest.
FILE_PATH = "./data/"


# On-disk location of the persisted FAISS index.
VECTORSTORE = "./vectorstore/agriquery_faiss_index"


# Shared splitter: ~300-character chunks with a 50-character overlap so
# context is preserved across chunk boundaries.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=300,
    chunk_overlap=50
)
|
|
|
|
|
|
|
|
async def ingest(file_name, path):
    """
    Load the content of one file asynchronously using PyMuPDF.

    input (str): file_name — name of the file; path — directory it lives in
    output (list): file content as a list of Document objects, one per page
    """
    pages = []
    # os.path.join handles the separator correctly even when `path` has no
    # trailing slash (the plain `path + file_name` concatenation did not).
    loader = PyMuPDFLoader(os.path.join(path, file_name))

    # Stream pages lazily so large PDFs are not held in memory all at once.
    async for page in loader.alazy_load():
        pages.append(page)

    return pages
|
|
|
|
|
|
|
|
|
|
|
def chunk(file_content):
    """
    Split file content into overlapping chunks using the module-level
    RecursiveCharacterTextSplitter.

    input (list): file content divided by pages (objects exposing
                  `.page_content`)
    output (list): flat list of chunk Documents across all pages
    """
    # Wrap each page's text in a fresh Document; `split_documents` accepts
    # the whole list and already concatenates the per-document splits, so
    # the previous per-page loop plus itertools.chain flattening is
    # unnecessary.
    docs = [Document(page_content=page.page_content) for page in file_content]
    return splitter.split_documents(docs)
|
|
|
|
|
|
|
|
|
|
|
def embed(chunks):
    """
    Embed the chunks with a HuggingFace sentence transformer and persist
    them in a local FAISS index at VECTORSTORE.

    input (list): chunk Documents
    output (FAISS): the vector store that was just saved
    """
    embeddings = HuggingFaceBgeEmbeddings(model_name="all-MiniLM-L6-v2")
    db = FAISS.from_documents(chunks, embeddings)
    db.save_local(VECTORSTORE)
    # Return the store so callers can query it immediately without
    # reloading from disk (the original returned None despite its
    # docstring promising an output).
    return db
|
|
|
|
|
|
|
|
|
|
|
async def main():
    """
    Ingest every file in FILE_PATH concurrently, chunk the resulting
    pages, and embed all chunks into the FAISS vector store.
    """
    # Skip sub-directories and other non-file entries so PyMuPDFLoader is
    # only ever handed a real file (os.listdir returns everything).
    file_names = [
        name for name in os.listdir(FILE_PATH)
        if os.path.isfile(os.path.join(FILE_PATH, name))
    ]

    # Load all documents concurrently; one ingest coroutine per file.
    tasks = [ingest(file_name, FILE_PATH) for file_name in file_names]
    page_list = await asyncio.gather(*tasks)

    # Chunk each file's pages and flatten into a single list.
    chunks = list(
        itertools.chain.from_iterable(chunk(pages) for pages in page_list)
    )

    print(f"Total length of chunks is: {len(chunks)}")

    embed(chunks)

    print("Success!")
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # asyncio.run creates the event loop and drives main() to completion.
    asyncio.run(main())