| from langchain.chains import RetrievalQA | |
| from langchain_community.document_loaders import UnstructuredHTMLLoader | |
| from langchain_openai import OpenAIEmbeddings | |
| from langchain_openai import ChatOpenAI | |
| from langchain.text_splitter import CharacterTextSplitter | |
| from langchain_community.vectorstores import Chroma | |
def get_retrieval_qa(
    filename: str,
    *,
    chunk_size: int = 1000,
    chunk_overlap: int = 0,
    k: int = 2,
    verbose: bool = True,
) -> "RetrievalQA":
    """Build a retrieval question-answering chain over one HTML file.

    The file is loaded with ``UnstructuredHTMLLoader``, split into chunks
    with ``CharacterTextSplitter``, embedded with ``OpenAIEmbeddings`` and
    indexed in a ``Chroma`` vector store. The returned chain answers
    questions with ``ChatOpenAI`` using the "stuff" chain type and is
    configured to return the source documents alongside the answer.

    Args:
        filename: Path of the HTML file to load and index.
        chunk_size: ``chunk_size`` passed to ``CharacterTextSplitter``.
        chunk_overlap: ``chunk_overlap`` passed to ``CharacterTextSplitter``.
        k: Number of chunks the similarity retriever fetches per query.
        verbose: ``verbose`` flag forwarded to the chain.

    Returns:
        The chain produced by ``RetrievalQA.from_chain_type``.

    Note:
        ``OpenAIEmbeddings()`` and ``ChatOpenAI()`` are created with their
        default configuration; presumably they need OpenAI credentials to
        be available in the environment -- confirm for your deployment.
    """
    # Load the HTML file into LangChain documents.
    documents = UnstructuredHTMLLoader(filename).load()

    # Split the documents into chunks.
    splitter = CharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    chunks = splitter.split_documents(documents)

    # Embed the chunks and create the vector store used as the index.
    # No persistence options are passed, so the store's defaults apply.
    vector_store = Chroma.from_documents(chunks, OpenAIEmbeddings())

    # Expose the index through a retriever interface.
    retriever = vector_store.as_retriever(
        search_type="similarity",
        search_kwargs={"k": k},
    )

    # Create the chain that answers questions from the retrieved chunks.
    return RetrievalQA.from_chain_type(
        llm=ChatOpenAI(),
        chain_type="stuff",
        retriever=retriever,
        return_source_documents=True,
        verbose=verbose,
    )