Spaces:
Sleeping
Sleeping
| ''' | |
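| This module contains the document loaders and the helpers that build the embeddings, vector store, retriever and question-answering chain on top of them | |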
| ''' | |
| import os | |
| from langchain.document_loaders import PyPDFLoader, Docx2txtLoader, TextLoader | |
| from langchain.text_splitter import CharacterTextSplitter | |
| from langchain.embeddings import OpenAIEmbeddings | |
| from langchain.vectorstores import Chroma | |
| from langchain.chains import RetrievalQA | |
| from langchain.chat_models import ChatOpenAI | |
| from constants import TEMPERATURE, MODEL_NAME | |
# Read once at import time; a missing OPENAI_API_KEY raises KeyError here,
# before any of the helpers below can be called.
openai_api_key = os.environ['OPENAI_API_KEY']
def load_pdf(path: str = "resume.pdf"):
    '''
    Load a single PDF file from a filesystem path.

    path: location of the PDF handed to PyPDFLoader (defaults to "resume.pdf").
    Returns whatever PyPDFLoader.load() produces for that file.
    '''
    return PyPDFLoader(path).load()
def load_multiple_documents(path: str = "documents"):
    '''
    Load every supported document found directly inside a folder.

    path: folder to scan (defaults to "documents"). Only its direct entries
        are considered; sub-folders are not walked.

    Entries are dispatched on their file-name suffix:
        .pdf          -> PyPDFLoader
        .docx / .doc  -> Docx2txtLoader
        .txt          -> TextLoader
    Anything else is skipped.

    Returns a flat list with the results of every loader's load() call,
    or an empty list when the folder holds no supported file.
    Raises whatever os.listdir raises when ``path`` cannot be listed.
    '''
    documents = []
    # os.listdir returns entries in arbitrary order; sorting keeps the
    # resulting document order stable from run to run.
    for file in sorted(os.listdir(path)):
        # Build the location from ``path`` itself. The folder used to be
        # hard-coded as './documents/', so any other ``path`` listed one
        # folder and then tried to open the files from a different one.
        file_path = os.path.join(path, file)
        if file.endswith('.pdf'):
            loader = PyPDFLoader(file_path)
        elif file.endswith(('.docx', '.doc')):
            # NOTE(review): legacy .doc files are sent to the same loader as
            # .docx - confirm Docx2txtLoader can actually read them.
            loader = Docx2txtLoader(file_path)
        elif file.endswith('.txt'):
            loader = TextLoader(file_path)
        else:
            continue
        documents.extend(loader.load())
    return documents
def get_embeddings(documents):
    '''
    Split documents into chunks and create the embeddings object for them.

    documents: the loaded documents to split.

    Returns a (texts, embeddings) tuple: ``texts`` is the output of a
    CharacterTextSplitter (chunk_size=1000, chunk_overlap=10) applied to
    ``documents``; ``embeddings`` is an OpenAIEmbeddings instance built with
    the module-level API key.
    '''
    chunks = CharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=10,
    ).split_documents(documents)
    embedding_model = OpenAIEmbeddings(openai_api_key=openai_api_key)
    return chunks, embedding_model
def get_db(texts, embeddings):
    '''
    Build a Chroma vector store from split texts and an embeddings object.

    texts: the document chunks to index.
    embeddings: the embeddings object passed on to Chroma.
    Returns the store created by Chroma.from_documents.
    '''
    return Chroma.from_documents(texts, embeddings)
def get_retriever(db):
    '''
    Turn a vector store into a similarity retriever.

    db: any object exposing as_retriever(search_type=..., search_kwargs=...).
    Returns the result of db.as_retriever with search_type "similarity" and
    search_kwargs {"k": 1}.
    '''
    search_options = {"k": 1}
    return db.as_retriever(search_type="similarity", search_kwargs=search_options)
def get_chain_for_pdf(path):
    '''
    Build a RetrievalQA chain over the documents stored in a folder.

    path: folder handed to load_multiple_documents.

    The loaded documents are split and paired with an embeddings object
    (get_embeddings), indexed (get_db) and wrapped in a retriever
    (get_retriever). The chain uses ChatOpenAI configured from TEMPERATURE,
    MODEL_NAME and the module-level API key, with chain_type "stuff" and
    return_source_documents=True.

    Returns the chain created by RetrievalQA.from_chain_type.
    '''
    texts, embeddings = get_embeddings(load_multiple_documents(path))
    retriever = get_retriever(get_db(texts, embeddings))
    llm = ChatOpenAI(
        temperature=TEMPERATURE,
        openai_api_key=openai_api_key,
        model=MODEL_NAME,
    )
    return RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=retriever,
        return_source_documents=True,
    )