File size: 1,662 Bytes
d895362
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
#from langchain.embeddings import HuggingFaceEmbeddings
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings

from langchain.vectorstores import Chroma
from langchain.text_splitter import CharacterTextSplitter
from langchain.llms import OpenAI
from langchain.chains import ConversationalRetrievalChain, RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.document_loaders import TextLoader, PyPDFLoader
from typing import Optional
import os 



# Name of the sentence-transformers model used to build `embeddings` below.
embeddings_model_name = "multi-qa-MiniLM-L6-cos-v1"
# Directory passed to Chroma as `persist_directory` when a store is persisted.
persist_directory = "db"
# NOTE(review): not referenced in the visible part of this file; presumably the
# number of chunks to retrieve per query -- confirm against the consumer.
target_source_chunks = 4
# Evaluates to None when OPENAI_API_KEY is not set in the environment.
openai_api_key = os.getenv("OPENAI_API_KEY")

# Shared embedding function, instantiated once at import time; it is the
# default `embeddings` argument of load_vectorestore_from_pdf.
# Alternative kept for reference:
#   HuggingFaceEmbeddings(model_name=embeddings_model_name)
embeddings = SentenceTransformerEmbeddings(model_name=embeddings_model_name)


def load_vectorestore_from_pdf(path:str, embeddings=embeddings, persist:Optional[bool]=True):
    """Build a Chroma vector store from the PDF at *path*.

    The PDF is read with ``PyPDFLoader``, split with a
    ``CharacterTextSplitter`` (``chunk_size=1000``, ``chunk_overlap=0``) and
    the resulting chunks are indexed with ``Chroma.from_documents``.

    Args:
        path: Location of the PDF file to load.
        embeddings: Embedding function handed to Chroma; defaults to the
            module-level ``embeddings`` instance.
        persist: When truthy, the store is created under the module-level
            ``persist_directory`` and ``persist()`` is called on it. When
            falsy (including ``None``), the store is created with
            ``persist_directory=None`` and returned.

    Returns:
        The Chroma vector store when ``persist`` is falsy; ``None`` when the
        store was persisted (the store object is not handed back in that
        case).
    """
    pages = PyPDFLoader(path).load()

    splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    chunks = splitter.split_documents(pages)

    if persist:
        # Persisting mode: write the index out and hand nothing back.
        Chroma.from_documents(
            chunks, embeddings, persist_directory=persist_directory
        ).persist()
        return None

    # Non-persisting mode: the caller receives the store directly.
    return Chroma.from_documents(chunks, embeddings, persist_directory=None)
    

if __name__ == "__main__":
    # Script entry point: index a single PDF into the persisted Chroma store.
    # `path` is a required argument of load_vectorestore_from_pdf, so it is
    # taken from the command line; calling the loader with no arguments
    # raises TypeError.
    import sys

    if len(sys.argv) != 2:
        # sys.exit with a string prints it to stderr and exits with status 1.
        sys.exit(f"usage: {sys.argv[0]} PDF_PATH")
    load_vectorestore_from_pdf(sys.argv[1])