| from langchain_openai import OpenAIEmbeddings | |
| from langchain_community.vectorstores import Chroma | |
| from langchain.text_splitter import RecursiveCharacterTextSplitter | |
| from langchain_community.document_loaders import PyPDFLoader | |
| import pathlib | |
| import gdown | |
| from .chains import PdfAndGoogleChain | |
def embed_pdf(folder: str = 'data', name: str = 'book.pdf', *,
              chunk_size: int = 1000, chunk_overlap: int = 100):
    """Load the book PDF (downloading it if absent) and split it into chunks.

    Despite the name, no embeddings are computed here: the function only
    makes sure the PDF is on disk, loads it with ``PyPDFLoader`` and splits
    the loaded documents with ``RecursiveCharacterTextSplitter``.

    Args:
        folder: Directory that holds (or will hold) the PDF. It is created,
            including any missing parent directories, when it does not exist.
        name: File name of the PDF inside ``folder``.
        chunk_size: ``chunk_size`` forwarded to the text splitter.
        chunk_overlap: ``chunk_overlap`` forwarded to the text splitter.

    Returns:
        Whatever ``RecursiveCharacterTextSplitter.split_documents`` returns
        for the loaded PDF documents.
    """
    target_dir = pathlib.Path(folder)
    # parents=True so a nested folder such as "data/books" does not raise
    # FileNotFoundError when the intermediate directories are missing.
    target_dir.mkdir(parents=True, exist_ok=True)
    path = target_dir / name

    if not path.exists():
        print('Downloading book PDF.')
        # The URL is a Google Drive sharing ("view") link rather than a
        # direct download link, hence fuzzy=True.
        gdown.download(
            'https://drive.google.com/file/d/1CwhFM4gInp9xV4G4sdnYE_rN0StmqQ2z/view?usp=drive_link',
            str(path),
            fuzzy=True,
        )

    documents = PyPDFLoader(str(path)).load()
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(documents)
class AcademicClient:
    """Question-answering client backed by a Chroma store and a PdfAndGoogleChain.

    On construction the client obtains a vector store (see
    ``create_vectordb``) and wraps its retriever in a ``PdfAndGoogleChain``;
    ``answer`` then delegates to that chain.
    """

    def __init__(self):
        """Set up ``self.vectordb`` and build ``self.chain`` on top of it."""
        self.create_vectordb()
        # Retriever settings are passed through to the vector store as-is.
        retriever = self.vectordb.as_retriever(
            search_type="mmr",
            search_kwargs={"fetch_k": 30, "k": 6},
        )
        self.chain = PdfAndGoogleChain(retriever)

    def create_vectordb(self):
        """Populate ``self.vectordb`` from a cached directory or rebuild it.

        The directories ``db`` and ``src/db`` are probed in that order
        (relative to the current working directory); the first one that
        exists is opened as the persisted Chroma store. When neither exists,
        the store is rebuilt from the chunks returned by ``embed_pdf()`` and
        persisted under ``db``.
        """
        for cache_dir in ('db', 'src/db'):
            if pathlib.Path(cache_dir).exists():
                self.vectordb = Chroma(
                    persist_directory=cache_dir,
                    embedding_function=OpenAIEmbeddings(),
                )
                return

        # No cached store found: embedding the whole book calls the OpenAI API.
        print('Not found cached DB. Rebuilding DB state, could use money from OPENAI!!!!')
        chunks = embed_pdf()
        self.vectordb = Chroma.from_documents(
            documents=chunks,
            embedding=OpenAIEmbeddings(),
            persist_directory="db",
        )
        self.vectordb.persist()

    def answer(self, query, options):
        """Return ``self.chain.answer(query, options)`` unchanged."""
        chain = self.chain
        return chain.answer(query, options)