# check_modules / load_db.py
# reyemhorts's picture
# fixed dependency
# 174ce28
#from langchain.embeddings import HuggingFaceEmbeddings
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain.vectorstores import Chroma
from langchain.text_splitter import CharacterTextSplitter
from langchain.llms import OpenAI
from langchain.chains import ConversationalRetrievalChain, RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.document_loaders import TextLoader, PyPDFLoader
from typing import Optional
import os
# Model name handed to SentenceTransformerEmbeddings when the module-level
# `embeddings` object is built below.
embeddings_model_name ="multi-qa-MiniLM-L6-cos-v1"
# Directory passed to Chroma as `persist_directory` when the store is persisted.
persist_directory = "db"
# Not referenced in the visible part of this file; presumably the number of
# chunks to retrieve per query -- TODO confirm against callers.
target_source_chunks = 4
# Read from the environment; this is None when OPENAI_API_KEY is not set.
openai_api_key = os.environ.get('OPENAI_API_KEY')
# Earlier HuggingFaceEmbeddings variant, kept disabled:
#embeddings = HuggingFaceEmbeddings(model_name=embeddings_model_name)
# Shared embedding instance, created at import time; it is the default value of
# the `embeddings` parameter of load_vectorestore_from_pdf.
embeddings = SentenceTransformerEmbeddings(model_name=embeddings_model_name)
def load_vectorestore_from_pdf(path: str, embeddings=embeddings, persist: Optional[bool] = True):
    """Build a Chroma vector store from the PDF at *path*.

    The PDF is loaded with ``PyPDFLoader`` and split with a
    ``CharacterTextSplitter`` configured with ``chunk_size=1000`` and
    ``chunk_overlap=0``; the resulting chunks are embedded into Chroma.

    Args:
        path: Location of the PDF file handed to ``PyPDFLoader``.
        embeddings: Embedding object passed to ``Chroma.from_documents``.
            Defaults to the module-level ``embeddings`` instance.
        persist: When truthy, the store is created with the module-level
            ``persist_directory`` and ``persist()`` is called on it. When
            falsy (including ``None``), the store is created with
            ``persist_directory=None`` and returned to the caller.

    Returns:
        The ``Chroma`` store when *persist* is falsy, otherwise ``None``.
    """
    pages = PyPDFLoader(path).load()
    splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    chunks = splitter.split_documents(pages)

    if persist:
        store = Chroma.from_documents(
            chunks,
            embeddings,
            persist_directory=persist_directory,
        )
        store.persist()
        # Drop the local reference to the store before returning; the
        # persisting path hands nothing back to the caller.
        del store
        return None

    return Chroma.from_documents(chunks, embeddings, persist_directory=None)
if __name__ == "__main__":
    # Script entry point: build and persist the vector store for one PDF.
    # load_vectorestore_from_pdf requires a `path` argument, so it is taken
    # from the command line instead of calling the function with no arguments
    # (which raises TypeError).
    import sys

    if len(sys.argv) != 2:
        sys.exit(f"usage: {sys.argv[0]} <path-to-pdf>")
    load_vectorestore_from_pdf(sys.argv[1])