"""Vectorize PDF documents from a local directory into a PGVector store."""

import os

from langchain_community.document_loaders import DirectoryLoader, UnstructuredFileLoader
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_postgres.vectorstores import PGVector
from langchain_text_splitters import CharacterTextSplitter

# Embedding model is loaded once at import time so repeated calls reuse it.
embeddings = HuggingFaceEmbeddings()

# NOTE(review): the password was hard-coded in the source. Prefer setting the
# PG_CONNECTION_STRING environment variable; the fallback keeps the original
# credentials so existing setups still run, but it should be removed.
# The dialect is "postgresql+psycopg" (psycopg v3): langchain_postgres does not
# support the psycopg2 driver, so the original "postgresql+psycopg2://" string
# fails when PGVector connects.
DEFAULT_CONNECTION = os.environ.get(
    "PG_CONNECTION_STRING",
    "postgresql+psycopg://postgres:krishna23@localhost:5432/vector_db",
)


def vectorize_documents(
    data_path="Data",
    glob="./*.pdf",
    collection_name="whatsapp_chatbot",
    connection=None,
):
    """Load PDFs, split them into chunks, and store their embeddings in PGVector.

    Args:
        data_path: Directory containing the PDF files (default: "Data").
        glob: Glob pattern selecting which files to load (default: "./*.pdf").
        collection_name: PGVector collection to write to.
        connection: SQLAlchemy connection string; defaults to
            ``DEFAULT_CONNECTION`` (overridable via ``PG_CONNECTION_STRING``).

    Returns:
        None. Prints progress/error messages to stdout.
    """
    try:
        # Load every matching PDF via the unstructured parser.
        loader = DirectoryLoader(
            path=data_path,
            glob=glob,
            loader_cls=UnstructuredFileLoader,
        )
        documents = loader.load()

        if not documents:
            print("No documents found in the specified directory.")
            return

        # Chunk the documents; the 500-char overlap preserves context that
        # would otherwise be cut at chunk boundaries.
        text_splitter = CharacterTextSplitter(
            chunk_size=2000,
            chunk_overlap=500,
        )
        text_chunks = text_splitter.split_documents(documents)

        # Embed the chunks and persist them in PostgreSQL (pgvector extension).
        PGVector.from_documents(
            embedding=embeddings,
            documents=text_chunks,
            collection_name=collection_name,
            connection=connection or DEFAULT_CONNECTION,
        )

        print("Documents vectorized successfully and stored in PGVector.")
    except Exception as e:
        # Top-level script boundary: report the failure instead of crashing.
        print(f"An error occurred: {e}")


# Main guard to prevent execution on import.
if __name__ == "__main__":
    vectorize_documents()