# Ingest PDFs from ./Data and store their embeddings in a PGVector collection.
from langchain_community.document_loaders import UnstructuredFileLoader
from langchain_community.document_loaders import DirectoryLoader
from langchain_text_splitters import CharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_postgres.vectorstores import PGVector
# Instantiate the embedding model once at module import so it is shared
# by every call to vectorize_documents().
# NOTE(review): HuggingFaceEmbeddings() with no arguments uses the library's
# default model, downloaded on first use -- confirm this is intended.
embeddings = HuggingFaceEmbeddings()
# Define a function to perform vectorization
def vectorize_documents():
    """Load PDFs from the ``Data`` directory, split them into chunks,
    embed them, and persist the embeddings in a PGVector collection.

    Prints a status message on success. Any exception is caught and
    reported rather than raised, so a failed run does not crash the
    calling script.
    """
    import os  # local import: only needed for the env-var lookup below

    try:
        # Load every PDF directly under "Data".
        # NOTE(review): "./*.pdf" is non-recursive; switch to "**/*.pdf"
        # if PDFs in subdirectories should be included.
        loader = DirectoryLoader(
            path="Data",
            glob="./*.pdf",
            loader_cls=UnstructuredFileLoader,
        )
        documents = loader.load()
        if not documents:
            # Nothing to index -- bail out instead of creating an empty
            # collection.
            print("No documents found in the specified directory.")
            return

        # Split into overlapping chunks so each embedding covers a bounded
        # span of text while the overlap preserves context at boundaries.
        text_splitter = CharacterTextSplitter(
            chunk_size=2000,
            chunk_overlap=500,
        )
        text_chunks = text_splitter.split_documents(documents)

        # SECURITY: the DSN below embeds a plaintext password. Prefer the
        # PG_CONNECTION environment variable; the original literal is kept
        # only as a backward-compatible fallback.
        connection_string = os.environ.get(
            "PG_CONNECTION",
            "postgresql+psycopg2://postgres:krishna23@localhost:5432/vector_db",
        )
        collection_name = "whatsapp_chatbot"

        # Embed the chunks and store them in the PGVector collection.
        # (Return value not needed here, so it is not bound.)
        PGVector.from_documents(
            embedding=embeddings,
            documents=text_chunks,
            collection_name=collection_name,
            connection=connection_string,
        )
        print("Documents vectorized successfully and stored in PGVector.")
    except Exception as e:
        # Top-level boundary of a CLI script: report and swallow so the
        # process exits cleanly instead of dumping a traceback.
        print(f"An error occurred: {e}")
# Entry point: run the ingestion only when executed as a script, never on
# import. (Indentation of the guarded call restored -- it was lost in the
# original paste, which made the file invalid Python.)
if __name__ == "__main__":
    vectorize_documents()