File size: 1,140 Bytes
8d924c8
 
 
 
 
 
 
 
 
 
 
 
 
 
3634e75
 
 
 
 
 
 
 
8d924c8
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
import os
from langchain.document_loaders import PyMuPDFLoader, PyPDFDirectoryLoader
from langchain.embeddings import VoyageEmbeddings, OpenAIEmbeddings, HuggingFaceInstructEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import DeepLake
from dotenv import load_dotenv
# Load variables from a .env file into the process environment before any
# client is constructed (presumably the API credentials used by the
# embedding and DeepLake clients -- confirm against deployment config).
load_dotenv()

# Module-level embedding model; load_documents() passes it to
# init_vectorstore() when opening the resumes dataset.
embeddings = VoyageEmbeddings(model="voyage-lite-01", show_progress_bar=True)

def init_vectorstore(dataset_path="hub://p1utoze/default", embeddings="voyage/voyage-lite-01"):
    """Construct and return a DeepLake vector store.

    Args:
        dataset_path: Dataset location handed to ``DeepLake`` unchanged.
        embeddings: Value forwarded as DeepLake's ``embedding`` argument.
            NOTE(review): the default is a plain string, while the caller in
            this file passes an embeddings object -- confirm whether the
            string default is actually usable.

    Returns:
        The newly constructed ``DeepLake`` instance.
    """
    return DeepLake(embedding=embeddings, dataset_path=dataset_path)

def load_documents(file_path=None, base_path="data/INFORMATION-TECHNOLOGY/"):
    """Split PDF(s) into chunks and add them to the resumes vector store.

    Args:
        file_path: Path of a single PDF to ingest. When falsy, every entry
            found in ``base_path`` is ingested instead.
        base_path: Directory scanned when ``file_path`` is not given.

    Returns:
        None. The chunks are added to the ``hub://p1utoze/resumes`` dataset
        using the module-level ``embeddings`` object.
    """
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=50)
    docs = []

    if file_path:
        docs.extend(PyMuPDFLoader(file_path).load_and_split(text_splitter))
    else:
        for file in os.listdir(base_path):
            # os.path.join keeps the path valid whether or not base_path
            # ends with a separator.
            path = os.path.join(base_path, file)
            print(path)
            # Accumulate chunks from every file; previously the loader was
            # rebound on each iteration, so only the last file was indexed.
            docs.extend(PyMuPDFLoader(path).load_and_split(text_splitter))

    if not docs:
        # Nothing to index (e.g. empty directory): skip opening the dataset
        # instead of failing on an unbound loader as before.
        return

    db = init_vectorstore("hub://p1utoze/resumes", embeddings)
    db.add_documents(docs)


# print(load_documents())