File size: 1,140 Bytes
8d924c8 3634e75 8d924c8 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 |
import os
from langchain.document_loaders import PyMuPDFLoader, PyPDFDirectoryLoader
from langchain.embeddings import VoyageEmbeddings, OpenAIEmbeddings, HuggingFaceInstructEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import DeepLake
from dotenv import load_dotenv
# Runs before the embedding client below is constructed.
# NOTE(review): presumably this pulls API keys from a local .env file into the
# environment — confirm which variables the embedding/vector-store clients need.
load_dotenv()
# Module-level embedding model; load_documents() hands this object to
# init_vectorstore() when it writes document chunks to the vector store.
embeddings = VoyageEmbeddings(model="voyage-lite-01", show_progress_bar=True)
def init_vectorstore(dataset_path="hub://p1utoze/default", embeddings="voyage/voyage-lite-01"):
    """Build a DeepLake vector store for ``dataset_path``.

    Args:
        dataset_path: Location of the DeepLake dataset to use.
        embeddings: Either an embeddings object (anything DeepLake accepts as
            its ``embedding`` argument), or a string naming a Voyage model,
            written as ``"voyage/<model>"`` or just ``"<model>"``.

    Returns:
        The ``DeepLake`` instance bound to ``dataset_path``.

    Raises:
        ValueError: If ``embeddings`` is a ``"<provider>/<model>"`` string
            whose provider is not ``"voyage"``.
    """
    # The default argument is a plain string, but the vector store needs an
    # embeddings object; resolve the string here so the default call works.
    if isinstance(embeddings, str):
        provider, sep, model_name = embeddings.partition("/")
        if not sep:
            # No provider prefix: treat the whole string as the model name.
            model_name = provider
        elif provider != "voyage":
            raise ValueError(
                f"Unsupported embedding provider {provider!r} in {embeddings!r}; "
                "pass an embeddings object instead."
            )
        embeddings = VoyageEmbeddings(model=model_name)

    return DeepLake(dataset_path=dataset_path, embedding=embeddings)
def load_documents(file_path=None, base_path="data/INFORMATION-TECHNOLOGY/"):
    """Split PDF(s) into chunks and add them to the resumes vector store.

    Args:
        file_path: Path of a single PDF to ingest. When given (truthy),
            ``base_path`` is ignored.
        base_path: Directory whose entries are all ingested when
            ``file_path`` is not given.

    Returns:
        None. The chunks are written to the ``hub://p1utoze/resumes`` store.
    """
    if file_path:
        pdf_paths = [file_path]
    else:
        pdf_paths = []
        # Sorted so the ingestion order does not depend on the filesystem.
        for file_name in sorted(os.listdir(base_path)):
            # os.path.join works whether or not base_path ends with a slash.
            path = os.path.join(base_path, file_name)
            print(path)
            pdf_paths.append(path)

    # An empty directory means there is nothing to ingest; bail out instead
    # of failing on an undefined loader.
    if not pdf_paths:
        return

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=50)

    # Load every file, not just the last one seen, and gather all chunks so
    # the vector store is opened and written to once.
    docs = []
    for path in pdf_paths:
        docs.extend(PyMuPDFLoader(path).load_and_split(text_splitter))

    db = init_vectorstore("hub://p1utoze/resumes", embeddings)
    db.add_documents(docs)
# print(load_documents())