jorpier / src /services /document /add_document.py
puzan789's picture
add:updated
0870bc8
"""
Created By: ishwor subedi
Date: 2024-08-23
"""
import string
from uuid import uuid4
from langchain.docstore.document import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from src.services.vector_db.qdrent.upload_document import upload_document_existing_collection
class AddDocument:
    """Clean, chunk and upload raw texts to an existing vector store collection.

    The two embedding objects given at construction time are stored as-is and
    forwarded to ``upload_document_existing_collection`` on every upload.
    """

    # Deletes every ASCII punctuation character except the full stop. The table
    # never changes, so it is built once here instead of once per input text.
    _PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation.replace(".", ""))

    def __init__(self, vector_embedding, sparse_embedding):
        """Store the embedding objects used for later uploads.

        Args:
            vector_embedding: Passed to the upload helper as ``vector_embed``.
            sparse_embedding: Passed to the upload helper as ``sparse_embed``.
        """
        self.vector_embed = vector_embedding
        self.sparse_embed = sparse_embedding

    def add_documents(self, texts: list[tuple[str, str]], vectorstore: str) -> None:
        """Normalise ``texts``, split them into chunks and upload the chunks.

        Each text has its newlines replaced by spaces and all ASCII punctuation
        except ``.`` removed. It is then wrapped in a ``Document`` whose
        metadata carries the entry's source, and split into chunks of at most
        400 characters with a 100-character overlap.

        Args:
            texts: Sequence of entries where index 0 is the raw text and
                index 1 is its source. Extra trailing items are ignored.
            vectorstore: Forwarded unchanged to the upload helper as
                ``vectorstore`` (presumably the target collection name;
                confirm against the helper).

        Returns:
            None. When ``texts`` is empty, or splitting yields no chunks, the
            method returns without calling the upload helper.
        """
        # Nothing to process: avoid building a splitter and issuing an upload
        # request that carries no documents.
        if not texts:
            return

        splitter = RecursiveCharacterTextSplitter(
            chunk_size=400,
            chunk_overlap=100,
            add_start_index=True,
        )

        # Index access (rather than tuple unpacking) keeps entries with more
        # than two items working exactly as before.
        raw_documents = [
            Document(
                page_content=entry[0].replace("\n", " ").translate(self._PUNCTUATION_TABLE),
                metadata={"source": entry[1]},
            )
            for entry in texts
        ]

        documents = splitter.split_documents(raw_documents)
        if not documents:
            return

        upload_document_existing_collection(
            vector_embed=self.vector_embed,
            sparse_embed=self.sparse_embed,
            vectorstore=vectorstore,
            documents=documents,
        )