# WhatsApp_FAQ_AI_Chatbot / vectorize_documents.py
# Krish30's picture
# Upload 4 files
# 2d12c4f verified
from langchain_community.document_loaders import UnstructuredFileLoader
from langchain_community.document_loaders import DirectoryLoader
from langchain_text_splitters import CharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma
# Define a function to perform vectorization
def vectorize_documents(
    data_dir: str = "Data",
    glob: str = "./*.pdf",
    chunk_size: int = 2000,
    chunk_overlap: int = 500,
    persist_directory: str = "vector_db_dir",
    embeddings=None,
):
    """Load documents, split them into chunks and store them in Chroma.

    Called with no arguments, this uses the same inputs as before: files
    matching ``./*.pdf`` under ``Data`` are loaded with
    ``UnstructuredFileLoader``, split with a ``CharacterTextSplitter``
    (``chunk_size=2000``, ``chunk_overlap=500``), embedded with a
    default-constructed ``HuggingFaceEmbeddings`` and written to the
    ``vector_db_dir`` persist directory.

    Args:
        data_dir: Directory handed to ``DirectoryLoader`` as ``path``.
        glob: Glob pattern handed to ``DirectoryLoader``.
        chunk_size: ``chunk_size`` for ``CharacterTextSplitter``.
        chunk_overlap: ``chunk_overlap`` for ``CharacterTextSplitter``.
        persist_directory: Directory passed to Chroma as its persist
            directory.
        embeddings: Optional, already-constructed embedding object to use.
            When ``None``, a new ``HuggingFaceEmbeddings()`` is created.

    Returns:
        The ``Chroma`` vector store built from the chunks, or ``None`` when
        there was nothing to index (no store is created in that case).

    NOTE(review): running this again against an existing persist directory
    presumably adds the same chunks a second time -- confirm, and clear the
    directory first if duplicates matter.
    """
    loader = DirectoryLoader(
        path=data_dir,
        glob=glob,
        loader_cls=UnstructuredFileLoader,
    )
    documents = loader.load()

    # Splitting the text and creating chunks of these documents.
    text_splitter = CharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    text_chunks = text_splitter.split_documents(documents) if documents else []

    # Stop early rather than report success for an empty index; this also
    # avoids loading the embedding model when there is no work to do.
    if not text_chunks:
        print(
            f"No content found for {glob!r} in {data_dir!r}; "
            "nothing to vectorize"
        )
        return None

    # Load the embedding model only when the caller did not supply one.
    if embeddings is None:
        embeddings = HuggingFaceEmbeddings()

    # Store in Chroma vector DB
    vectordb = Chroma.from_documents(
        documents=text_chunks,
        embedding=embeddings,
        persist_directory=persist_directory,
    )
    print("Documents Vectorized and saved in VectorDB")
    return vectordb
# Module-level embedding model, instantiated when this module is imported and
# kept as a module attribute (presumably so other modules can import and
# reuse it -- confirm against callers). vectorize_documents() does not read
# this variable.
embeddings = HuggingFaceEmbeddings()
# Main guard: build the vector store only when this file is run as a script,
# not when it is imported.
if __name__ == "__main__":
    vectorize_documents()