| from langchain_community.document_loaders import UnstructuredFileLoader | |
| from langchain_community.document_loaders import DirectoryLoader | |
| from langchain_text_splitters import CharacterTextSplitter | |
| from langchain_huggingface import HuggingFaceEmbeddings | |
| from langchain_chroma import Chroma | |
# Define a function to perform vectorization
def vectorize_documents(
    *,
    data_dir: str = "Data_1",
    glob_pattern: str = "./*.pdf",
    persist_directory: str = "vector_db_dir_notes_ai",
    chunk_size: int = 2000,
    chunk_overlap: int = 500,
) -> "Chroma":
    """Load documents from a directory, chunk them, and index them in Chroma.

    Called with no arguments, this uses the same directory, glob, chunking
    parameters and persist directory that were previously hard-coded, so
    existing callers are unaffected. The only visible difference is that
    the created vector store is now returned instead of being discarded.

    Args:
        data_dir: Directory handed to ``DirectoryLoader`` as ``path``.
        glob_pattern: Glob handed to ``DirectoryLoader`` to select files.
        persist_directory: Value passed to ``Chroma.from_documents`` as
            ``persist_directory``.
        chunk_size: ``chunk_size`` for ``CharacterTextSplitter``.
        chunk_overlap: ``chunk_overlap`` for ``CharacterTextSplitter``.

    Returns:
        The ``Chroma`` vector store built from the document chunks.
    """
    # Embedding model constructed with its default configuration.
    embeddings = HuggingFaceEmbeddings()

    loader = DirectoryLoader(
        path=data_dir,
        glob=glob_pattern,
        loader_cls=UnstructuredFileLoader,
    )
    documents = loader.load()

    # Split the loaded documents into overlapping chunks.
    text_splitter = CharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    text_chunks = text_splitter.split_documents(documents)

    # Embed the chunks and store them in the Chroma vector DB.
    vectordb = Chroma.from_documents(
        documents=text_chunks,
        embedding=embeddings,
        persist_directory=persist_directory,
    )
    print("Documents Vectorized and saved in VectorDB")

    # Hand the store back so callers can query it without reopening it.
    return vectordb
# Module-level embedding model: this line runs whenever the module is
# imported, so other modules can import and reuse `embeddings`.
embeddings = HuggingFaceEmbeddings()

# Only build the vector store when this file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    vectorize_documents()
| # # Define a function to perform vectorization | |
| # def vectorize_documents(): | |
| # # Loading the embedding model | |
| # embeddings = HuggingFaceEmbeddings() | |
| # loader = DirectoryLoader( | |
| # path="Data", | |
| # glob="./*.pdf", | |
| # loader_cls=UnstructuredFileLoader | |
| # ) | |
| # documents = loader.load() | |
| # # Splitting the text and creating chunks of these documents. | |
| # text_splitter = CharacterTextSplitter( | |
| # chunk_size=2000, | |
| # chunk_overlap=500 | |
| # ) | |
| # text_chunks = text_splitter.split_documents(documents) | |
| # # Store in Chroma vector DB | |
| # vectordb = Chroma.from_documents( | |
| # documents=text_chunks, | |
| # embedding=embeddings, | |
| # persist_directory="vector_db_dir" | |
| # ) | |
| # print("Documents Vectorized and saved in VectorDB") | |
| # # Expose embeddings if needed | |
| # embeddings = HuggingFaceEmbeddings() | |
| # # Main guard to prevent execution on import | |
| # if __name__ == "__main__": | |
| # vectorize_documents() |