Phi2-PDF-chat

Sleeping

App Files Files Community

Phi2-PDF-chat / app.py

dinhquangson

Update app.py

13bddc0 verified almost 2 years ago

raw

history blame contribute delete

6.25 kB

	"""
	Question answering over uploaded PDFs with LangChain retrieval and a Chroma vector store.
	This Streamlit script splits uploaded PDFs into chunks, embeds them into a Chroma
	vector store, and answers questions through a LangChain conversational retrieval
	chain backed by a phi-2 model accessed via the HuggingFace Hub.
	"""

	import os
	import streamlit as st
	from dotenv import load_dotenv
	from PyPDF2 import PdfReader
	from langchain.text_splitter import RecursiveCharacterTextSplitter
	from langchain.document_loaders import UnstructuredPDFLoader
	from langchain.embeddings import HuggingFaceEmbeddings
	from langchain.vectorstores import Chroma
	from langchain.chat_models import ChatOpenAI
	from langchain.memory import ConversationBufferMemory
	from langchain.chains import ConversationalRetrievalChain
	from htmlTemplates import css, bot_template, user_template
	from langchain.llms import HuggingFaceHub
	#from llama_index.llms import LlamaCPP



	def get_pdf_pages(pdf_docs):
	    """
	    Load uploaded PDF files and split them into documents.

	    Each upload is written into a temporary directory so that
	    ``UnstructuredPDFLoader`` can read it from disk; the directory is
	    cleaned up when the function returns.

	    Parameters
	    ----------
	    pdf_docs : list or None
	        Uploaded file objects exposing ``name`` and ``getbuffer()``.
	        ``None`` or an empty list yields an empty result.

	    Returns
	    -------
	    list
	        Documents returned by ``load_and_split()`` for every PDF,
	        concatenated in upload order.

	    """
	    import tempfile

	    pages = []
	    with tempfile.TemporaryDirectory() as tmpdirname:
	        for pdf in pdf_docs or []:
	            # The file name is supplied by the client; keep only its final
	            # component so it cannot point outside the temporary directory.
	            safe_name = os.path.basename(pdf.name)
	            pdf_path = os.path.join(tmpdirname, safe_name)
	            with open(pdf_path, "wb") as f:
	                f.write(pdf.getbuffer())

	            pdf_loader = UnstructuredPDFLoader(pdf_path)
	            # extend() appends in place instead of copying the list per file.
	            pages.extend(pdf_loader.load_and_split())
	    return pages


	def get_text_chunks(pages, chunk_size=1024, chunk_overlap=64):
	    """
	    Split loaded documents into overlapping chunks.

	    Parameters
	    ----------
	    pages : list
	        Documents to split (the value returned by ``get_pdf_pages``).
	    chunk_size : int, optional
	        ``chunk_size`` forwarded to ``RecursiveCharacterTextSplitter``
	        (default 1024).
	    chunk_overlap : int, optional
	        ``chunk_overlap`` forwarded to ``RecursiveCharacterTextSplitter``
	        (default 64).

	    Returns
	    -------
	    list
	        Chunked documents produced by ``split_documents``.

	    """
	    text_splitter = RecursiveCharacterTextSplitter(
	        chunk_size=chunk_size, chunk_overlap=chunk_overlap
	    )
	    texts = text_splitter.split_documents(pages)
	    # Report the number of chunks on stdout.
	    print(len(texts))
	    return texts


	def get_vectorstore(text_chunks):
	    """
	    Embed document chunks and index them in a Chroma vector store.

	    Parameters
	    ----------
	    text_chunks : list
	        Chunked documents to embed.

	    Returns
	    -------
	    Chroma
	        Vector store built from ``text_chunks`` with HuggingFace
	        embeddings, created with ``persist_directory="db"``.

	    """
	    embedding_model = "sentence-transformers/all-MiniLM-L6-v2"
	    embedder = HuggingFaceEmbeddings(model_name=embedding_model)
	    # NOTE(review): the persist directory is a fixed relative path, so the
	    # store presumably carries over between runs - confirm this is intended.
	    return Chroma.from_documents(text_chunks, embedder, persist_directory="db")


	def get_conversation_chain(vectorstore):
	    """
	    Build a conversational retrieval chain on top of a vector store.

	    Parameters
	    ----------
	    vectorstore : object
	        Vector store exposing ``as_retriever()`` (a Chroma store in this app).

	    Returns
	    -------
	    ConversationalRetrievalChain
	        Chain wiring together the HuggingFace Hub LLM, the store's
	        retriever, and a buffer memory keyed by ``chat_history``.

	    """
	    # NOTE(review): several of these keys look like model-loading options
	    # rather than generation parameters - confirm the endpoint accepts them.
	    hub_model_kwargs = {
	        "temperature": 0.5,
	        "max_new_tokens": 1024,
	        "max_length": 1048,
	        "top_k": 3,
	        "trust_remote_code": True,
	        "torch_dtype": "auto",
	    }
	    # NOTE(review): the repo id names a GGUF build - confirm the hosted
	    # inference service can actually serve it.
	    llm = HuggingFaceHub(repo_id="TheBloke/phi-2-GGUF", model_kwargs=hub_model_kwargs)

	    # An OpenAI chat model (ChatOpenAI) was previously sketched as an alternative LLM.

	    history = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
	    retriever = vectorstore.as_retriever()
	    return ConversationalRetrievalChain.from_llm(
	        llm=llm, retriever=retriever, memory=history
	    )


	def handle_userinput(user_question):
	    """
	    Send a question to the conversation chain and render the chat history.

	    Parameters
	    ----------
	    user_question : str
	        The user's question.

	    Notes
	    -----
	    When no conversation chain is stored in the session state yet, a
	    warning is displayed and the question is not sent.
	    """
	    # The chain only exists after documents have been processed; without
	    # this guard an early question raises instead of informing the user.
	    if (
	        "conversation" not in st.session_state
	        or st.session_state.conversation is None
	    ):
	        st.warning("Please upload and process your PDFs before asking a question.")
	        return

	    response = st.session_state.conversation({"question": user_question})
	    st.session_state.chat_history = response["chat_history"]

	    # Even positions are rendered as user messages, odd positions as bot replies.
	    for i, message in enumerate(st.session_state.chat_history):
	        speaker = "//_^ User: " if i % 2 == 0 else "🤖 ChatBot: "
	        st.write(speaker + message.content)


	def main():
	    """
	    Putting it all together.

	    Configures the page, collects the HuggingFace Hub token, builds the
	    conversation chain from the uploaded PDFs when "Process" is clicked,
	    and answers questions typed into the main input.
	    """
	    st.set_page_config(
	        page_title="Chat with a Bot that tries to answer questions about multiple PDFs",
	        page_icon=":books:",
	    )

	    st.markdown("# Chat with a Bot")
	    st.markdown("This bot tries to answer questions about multiple PDFs. Let the processing of the PDF finish before adding your question. 🙏🏾")

	    st.write(css, unsafe_allow_html=True)

	    # SECURITY: a credential must never be embedded as the widget default.
	    # The default now comes from the environment (optionally loaded from a
	    # .env file); any token previously committed here should be revoked.
	    load_dotenv()
	    huggingface_token = st.text_input(
	        "Enter your HuggingFace Hub token",
	        type="password",
	        value=os.environ.get("HUGGINGFACEHUB_API_TOKEN", ""),
	    ).strip()
	    if huggingface_token:
	        # Accept tokens pasted without their "hf_" prefix.
	        if not huggingface_token.startswith("hf_"):
	            huggingface_token = "hf_" + huggingface_token
	        # Export the token as an environment variable.
	        os.environ["HUGGINGFACEHUB_API_TOKEN"] = huggingface_token

	    # Initialise both session keys so later reads never hit a missing attribute.
	    if "conversation" not in st.session_state:
	        st.session_state.conversation = None
	    if "chat_history" not in st.session_state:
	        st.session_state.chat_history = None

	    with st.sidebar:
	        st.subheader("Your documents")
	        pdf_docs = st.file_uploader(
	            "Upload your PDFs here and click on 'Process'", accept_multiple_files=True
	        )
	        if st.button("Process"):
	            if not pdf_docs:
	                # Nothing to index: skip the pipeline instead of running it on empty input.
	                st.warning("Please upload at least one PDF before processing.")
	            else:
	                with st.spinner("Processing"):
	                    # load the PDFs as documents
	                    pages = get_pdf_pages(pdf_docs)

	                    # split the documents into chunks
	                    text_chunks = get_text_chunks(pages)

	                    # create vector store
	                    vectorstore = get_vectorstore(text_chunks)

	                    # create conversation chain
	                    st.session_state.conversation = get_conversation_chain(vectorstore)

	    st.header("Chat with a Bot 🤖🦾 that tries to answer questions about multiple PDFs :books:")
	    user_question = st.text_input("Ask a question about your documents:")
	    if user_question:
	        if st.session_state.conversation is None:
	            # No chain yet: tell the user rather than failing on the call.
	            st.warning("Please upload and process your PDFs before asking a question.")
	        else:
	            handle_userinput(user_question)


	# Launch the app only when this file is executed as a script, not on import.
	if __name__ == "__main__":
	    main()