""" Created By: ishwor subedi Date: 2024-08-23 """ import string from uuid import uuid4 from langchain.docstore.document import Document from langchain_text_splitters import RecursiveCharacterTextSplitter from src.services.vector_db.qdrent.upload_document import upload_document_existing_collection class AddDocument: def __init__(self, vector_embedding, sparse_embedding): self.vector_embed = vector_embedding self.sparse_embed = sparse_embedding def add_documents(self, texts: list[tuple[str]], vectorstore: str): splitter = RecursiveCharacterTextSplitter( chunk_size=400, chunk_overlap=100, add_start_index=True ) sources = [textTuple[1] for textTuple in texts] texts = [textTuple[0].replace("\n", " ") for textTuple in texts] texts = [text.translate(str.maketrans('', '', string.punctuation.replace(".", ""))) for text in texts] texts = [Document(page_content=text, metadata={"source": source}) for text, source in zip(texts, sources)] documents = splitter.split_documents(texts) upload_document_existing_collection(vector_embed=self.vector_embed, sparse_embed=self.sparse_embed, vectorstore=vectorstore, documents=documents)