| """ | |
| Created By: ishwor subedi | |
| Date: 2024-08-23 | |
| """ | |
| import string | |
| from uuid import uuid4 | |
| from langchain.docstore.document import Document | |
| from langchain_text_splitters import RecursiveCharacterTextSplitter | |
| from src.services.vector_db.qdrent.upload_document import upload_document_existing_collection | |
class AddDocument:
    """Clean, chunk and upload raw texts to an existing vector-store collection.

    The two embedding objects are stored as given and forwarded unchanged to
    ``upload_document_existing_collection`` on every call to
    :meth:`add_documents`.
    """

    # Deletes every ASCII punctuation character except "." (the period is kept
    # on purpose by the original cleaning rule). Built once at class creation
    # instead of once per input text.
    _PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation.replace(".", ""))

    def __init__(self, vector_embedding, sparse_embedding):
        """Store the embedding objects used by later uploads.

        Args:
            vector_embedding: Forwarded as ``vector_embed`` to the uploader.
            sparse_embedding: Forwarded as ``sparse_embed`` to the uploader.
        """
        self.vector_embed = vector_embedding
        self.sparse_embed = sparse_embedding

    @classmethod
    def _clean_text(cls, text: str) -> str:
        """Replace newlines with spaces and drop all punctuation except '.'."""
        return text.replace("\n", " ").translate(cls._PUNCTUATION_TABLE)

    def add_documents(
        self,
        texts: list[tuple[str, str]],
        vectorstore: str,
        *,
        chunk_size: int = 400,
        chunk_overlap: int = 100,
    ) -> None:
        """Clean, split and upload ``(text, source)`` pairs.

        Each text is cleaned (newlines become spaces, punctuation other than
        "." is removed), wrapped in a ``Document`` whose metadata carries its
        source, split into overlapping chunks, and handed to
        ``upload_document_existing_collection``.

        Args:
            texts: Pairs whose first item is the raw text and whose second
                item is stored under the ``"source"`` metadata key.
            vectorstore: Forwarded as ``vectorstore`` to the uploader
                (presumably the target collection name -- confirm against
                the uploader).
            chunk_size: Maximum chunk size passed to the text splitter.
            chunk_overlap: Overlap between consecutive chunks passed to the
                text splitter.
        """
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            add_start_index=True,
        )

        # Index access (not unpacking) keeps accepting tuples that carry extra
        # trailing items, exactly as before. A single pass also means one-shot
        # iterables are no longer exhausted before the texts are read.
        raw_documents = [
            Document(
                page_content=self._clean_text(entry[0]),
                metadata={"source": entry[1]},
            )
            for entry in texts
        ]

        documents = splitter.split_documents(raw_documents)

        upload_document_existing_collection(
            vector_embed=self.vector_embed,
            sparse_embed=self.sparse_embed,
            vectorstore=vectorstore,
            documents=documents,
        )