import os
from pathlib import Path

from dotenv import load_dotenv
from langchain.document_loaders import PyMuPDFLoader, PyPDFDirectoryLoader
from langchain.embeddings import VoyageEmbeddings, OpenAIEmbeddings, HuggingFaceInstructEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import DeepLake
# Load variables from a local .env file (if one exists) into the process
# environment before any client is constructed.
# NOTE(review): presumably this is where the Voyage / DeepLake credentials
# come from -- confirm against the deployment setup.
load_dotenv()

# Module-wide embedding model; load_documents() hands it to the vector store.
embeddings = VoyageEmbeddings(
    model="voyage-lite-01",
    show_progress_bar=True,
)
def init_vectorstore(dataset_path="hub://p1utoze/default", embeddings="voyage/voyage-lite-01"):
    """Build a DeepLake vector store bound to ``dataset_path``.

    Args:
        dataset_path: Location of the DeepLake dataset (a ``hub://`` URI or
            any other path DeepLake accepts).
        embeddings: Either an embeddings object (e.g. ``VoyageEmbeddings``)
            or a model identifier string such as ``"voyage/voyage-lite-01"``.
            A string is resolved to a ``VoyageEmbeddings`` instance.

    Returns:
        The ``DeepLake`` vector store instance.
    """
    if isinstance(embeddings, str):
        # DeepLake needs an object it can call to embed text; a bare model
        # name (which is what the default is) cannot do that, so turn the
        # identifier into a real embeddings client first.
        # Assumes "<provider>/<model>" naming for a Voyage model -- TODO confirm.
        model_name = embeddings.rsplit("/", 1)[-1]
        embeddings = VoyageEmbeddings(model=model_name)

    return DeepLake(dataset_path=dataset_path, embedding=embeddings)
def load_documents(file_path=None, base_path="data/INFORMATION-TECHNOLOGY/"):
    """Split PDF content into chunks and add them to the resumes vector store.

    Args:
        file_path: Path of a single PDF to ingest. When given, ``base_path``
            is ignored.
        base_path: Directory whose files are all ingested when ``file_path``
            is not given. Sub-directories are skipped.

    Returns:
        None. The chunks are written to the ``hub://p1utoze/resumes`` dataset.
    """
    scanning_directory = not file_path
    if scanning_directory:
        # Path joining works with or without a trailing separator on
        # base_path; sorting makes the ingestion order reproducible.
        paths = [
            str(entry)
            for entry in sorted(Path(base_path).iterdir())
            if entry.is_file()
        ]
    else:
        paths = [file_path]

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=50)

    # Load and split every file, not just the last one seen, so both the
    # single-file and the directory mode ingest everything they were given.
    docs = []
    for path in paths:
        if scanning_directory:
            print(path)
        docs.extend(PyMuPDFLoader(path).load_and_split(text_splitter))

    if not docs:
        # Nothing to store (e.g. empty directory): do not open the dataset.
        return

    db = init_vectorstore("hub://p1utoze/resumes", embeddings)
    db.add_documents(docs)
# print(load_documents())