File size: 1,822 Bytes
64d661c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
import os

from langchain_community.document_loaders import DirectoryLoader
from langchain_community.document_loaders import UnstructuredFileLoader
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_postgres.vectorstores import PGVector
from langchain_text_splitters import CharacterTextSplitter

 # Loading the embedding model
embeddings = HuggingFaceEmbeddings()

# Define a function to perform vectorization
def vectorize_documents(
    data_path="Data",
    glob_pattern="./*.pdf",
    chunk_size=2000,
    chunk_overlap=500,
    collection_name="whatsapp_chatbot",
    connection_string=None,
):
    """Load documents from a directory, split them into chunks, embed them,
    and store the result in a PGVector (PostgreSQL) collection.

    Args:
        data_path: Directory searched for input documents.
        glob_pattern: Glob selecting files inside ``data_path``.
        chunk_size: Maximum characters per text chunk.
        chunk_overlap: Characters of overlap between consecutive chunks.
        collection_name: PGVector collection the chunks are written to.
        connection_string: SQLAlchemy-style Postgres URL. When ``None``,
            falls back to the ``PG_CONNECTION_STRING`` environment variable,
            then to the original hard-coded default.

    Prints a status message on success, on empty input, and on failure;
    returns ``None`` in all cases.
    """
    if connection_string is None:
        # SECURITY NOTE(review): the fallback URL embeds credentials in
        # source; prefer supplying PG_CONNECTION_STRING via the environment.
        connection_string = os.environ.get(
            "PG_CONNECTION_STRING",
            "postgresql+psycopg2://postgres:krishna23@localhost:5432/vector_db",
        )

    try:
        loader = DirectoryLoader(
            path=data_path,
            glob=glob_pattern,
            loader_cls=UnstructuredFileLoader,
        )
        documents = loader.load()

        # Nothing matched the glob — report and bail out instead of
        # creating an empty collection.
        if not documents:
            print("No documents found in the specified directory.")
            return

        # Split the documents into overlapping character chunks so each
        # embedding covers a bounded amount of text.
        text_splitter = CharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
        )
        text_chunks = text_splitter.split_documents(documents)

        # Embed every chunk with the module-level model and persist the
        # vectors in PostgreSQL via PGVector.
        PGVector.from_documents(
            embedding=embeddings,
            documents=text_chunks,
            collection_name=collection_name,
            connection=connection_string,
        )

        print("Documents vectorized successfully and stored in PGVector.")

    except Exception as e:
        # Top-level boundary for this script: report the failure rather
        # than letting it propagate past the __main__ guard.
        print(f"An error occurred: {e}")

# Entry point: run the vectorization only when this file is executed
# directly, never as a side effect of being imported.
if __name__ == "__main__":
    vectorize_documents()