# Page header captured along with this file (not part of the program):
# Spaces: Build error
# File size: 1,662 Bytes
# d895362
#from langchain.embeddings import HuggingFaceEmbeddings
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain.vectorstores import Chroma
from langchain.text_splitter import CharacterTextSplitter
from langchain.llms import OpenAI
from langchain.chains import ConversationalRetrievalChain, RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.document_loaders import TextLoader, PyPDFLoader
from typing import Optional
import os
# Directory handed to Chroma as persist_directory when a store is persisted.
persist_directory = "db"
# Presumably the number of chunks to retrieve per query; it is not referenced
# in the visible part of this file -- TODO confirm against the caller.
target_source_chunks = 4
# API key read from the environment; None when OPENAI_API_KEY is unset.
openai_api_key = os.getenv('OPENAI_API_KEY')
# Model name passed to SentenceTransformerEmbeddings below.
embeddings_model_name = "multi-qa-MiniLM-L6-cos-v1"
# Module-level embedding object; it is the default `embeddings` argument of
# load_vectorestore_from_pdf. (A HuggingFaceEmbeddings variant was previously
# used here with the same model name.)
embeddings = SentenceTransformerEmbeddings(model_name=embeddings_model_name)
def load_vectorestore_from_pdf(path: str, embeddings=embeddings, persist: Optional[bool] = True):
    """Build a Chroma vector store from the PDF at ``path``.

    The PDF is loaded with ``PyPDFLoader``, split with
    ``CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)``, and the
    resulting documents are passed to ``Chroma.from_documents``.

    Args:
        path: Location of the PDF file to index.
        embeddings: Embedding object handed to Chroma; defaults to the
            module-level ``embeddings``.
        persist: When truthy (the default), the store is created with the
            module-level ``persist_directory``, ``persist()`` is called on
            it, and nothing is returned. When falsy (``False`` or ``None``),
            the store is created with ``persist_directory=None`` and
            returned to the caller.

    Returns:
        The ``Chroma`` store when ``persist`` is falsy, otherwise ``None``.
    """
    raw_docs = PyPDFLoader(path).load()
    splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    chunks = splitter.split_documents(raw_docs)

    if not persist:
        return Chroma.from_documents(chunks, embeddings, persist_directory=None)

    store = Chroma.from_documents(chunks, embeddings, persist_directory=persist_directory)
    store.persist()
    # On this path the persisted copy is the result; no store is handed back.
    return None
if __name__ == "__main__":
    import sys

    # load_vectorestore_from_pdf requires a PDF path, so take it from the
    # command line instead of calling the function with no arguments.
    if len(sys.argv) != 2:
        sys.exit(f"usage: {sys.argv[0]} <path-to-pdf>")
    load_vectorestore_from_pdf(sys.argv[1])