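"""Streamlit app for conversing with your PDF documents.

Uploaded PDFs are split into overlapping text chunks, embedded with a
HuggingFace Instructor model into a persistent Chroma vector store, and
queried through a local GPT4All model wired into LangChain's
ConversationalRetrievalChain with conversation memory.
"""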

import os

import streamlit as st
# from dotenv import load_dotenv
from PyPDF2 import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.vectorstores import Chroma
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain.llms import GPT4All, HuggingFaceHub
from streamlit_chat import message
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.document_loaders import PyPDFLoader
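
# Ensure the directories this app reads and writes actually exist.
# (Assumption: a fresh deployment starts with an empty filesystem, so the
# upload and cache folders are created up front rather than expected to
# ship with the repo.)
os.makedirs("original_documents", exist_ok=True)
os.makedirs("./documents_cache/conversation_retrieval", exist_ok=True)
os.makedirs("./model_cache", exist_ok=True)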


def extract_text_chunks(doc):
    text = ""
    pdf_reader = PdfReader(doc)
    for page in pdf_reader.pages:
        # read and concatenate the text from
        # each page into the raw text string
        text += page.extract_text()
    return get_text_chunks(text)
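
# NOTE: extract_text_chunks() reads an in-memory upload directly with PyPDF2,
# mirroring the PyPDFLoader path in process_new_uploads(); it is not
# currently called from main().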


def get_text_chunks(raw_text):
    # return text chunks from the raw text extracted
    # from the user PDFs
    # st.info("Creating text chunks from raw text")
    # initialize the text splitter
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=250,
        length_function=len
    )
    # create the text chunks
    chunks = text_splitter.split_text(raw_text)
    # st.success("Text chunks created")
    return chunks
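
# Example (sketch) of what the splitter produces:
#   chunks = get_text_chunks(raw_text)
# Each chunk is at most 1000 characters, and consecutive chunks overlap by
# up to 250 characters so sentences straddling a boundary stay retrievable.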


def handle_user_input(user_question):
    if user_question is None:
        return
    # handle the user's query
    print(f"Generating response to user query: {user_question}")
    response = st.session_state.conversation({'question': user_question})
    print("Response generated. Updating session state chat_history")
    st.session_state.chat_history = response['chat_history']
    for i, msg in enumerate(st.session_state.chat_history):
        # the buffer memory alternates human/AI turns, so even indices
        # are the user's messages
        is_user_message = (i % 2 == 0)
        # a unique key per message avoids Streamlit duplicate-widget errors
        message(msg.content, is_user=is_user_message, key=str(i))


def process_new_uploads(pdf_docs):
    vectordb = st.session_state.vectorstore
    # vectordb = Chroma(persist_directory="./documents_cache/conversation_retrieval", embedding_function=embeddings)
    for doc in pdf_docs:
        print(f"Processing new file: {doc.name}")
        print(f"Saving original file copy: {doc.name}")
        with open(os.path.join("original_documents", doc.name), "wb") as f:
            f.write(doc.getbuffer())
        loader = PyPDFLoader(file_path=f"./original_documents/{doc.name}")
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
        text_chunks = text_splitter.split_documents(loader.load())
        vectordb.add_documents(documents=text_chunks)
    # flush the updated collection to disk once all files are indexed
    vectordb.persist()


def load_vector_database():
    embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl", cache_folder="./model_cache")
    st.session_state.vectorstore = Chroma(persist_directory="./documents_cache/conversation_retrieval", embedding_function=embeddings)


def initialize_conversation_chain():
    callbacks = [StreamingStdOutCallbackHandler()]
    # the Snoozy model failed to start on my machine:
    # local_model_snoozy = "./model_cache/ggml-gpt4all-l13b-snoozy.bin"
    local_model_groovy = "./model_cache/ggml-gpt4all-j-v1.3-groovy.bin"
    # HuggingFaceHub(verbose=True, cache=True, task="text-generation", repo_id="tiiuae/falcon-40b-instruct")
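    # Expanded form of the hosted alternative above (sketch; assumes a
    # HUGGINGFACEHUB_API_TOKEN is set in the environment):
    # llm_instance = HuggingFaceHub(
    #     repo_id="tiiuae/falcon-40b-instruct",
    #     task="text-generation",
    #     cache=True,
    #     verbose=True,
    # )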
    llm_instance = GPT4All(model=local_model_groovy, callbacks=callbacks, verbose=True)
    vectordb = st.session_state.vectorstore
    memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
    conversation_chain = ConversationalRetrievalChain.from_llm(
        llm=llm_instance,
        retriever=vectordb.as_retriever(),
        memory=memory
    )
    st.session_state.conversation = conversation_chain
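    # Quick smoke test (sketch): the chain can also be queried directly,
    # outside Streamlit; with memory attached it returns a dict whose
    # "answer" key holds the model's reply.
    # result = conversation_chain({"question": "What do these documents cover?"})
    # print(result["answer"])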


def main():
    # load_dotenv()
    st.set_page_config(page_title="Converse with your Documents", page_icon=":books:")
    st.header("Converse with your Documents! :books:")
    if "vectorstore" not in st.session_state:
        with st.spinner("Loading Vector Database..."):
            load_vector_database()
    if "conversation" not in st.session_state:
        with st.spinner("Initializing AI. Please wait..."):
            initialize_conversation_chain()
    if "chat_history" not in st.session_state:
        st.session_state.chat_history = None
    user_question = st.text_input("Ask a question to your documents here:")
    if user_question:
        handle_user_input(user_question)
    with st.sidebar:
        st.subheader("Your documents")
        pdf_docs = st.file_uploader(
            "Upload your PDFs here and click on 'Process'",
            type=["pdf"],
            accept_multiple_files=True
        )
        if st.button("Process"):
            with st.spinner("Processing..."):
                process_new_uploads(pdf_docs)


if __name__ == "__main__":
    main()