ujaganna committed on
Commit
2d692c0
·
verified ·
1 Parent(s): 70a0ea7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +81 -77
app.py CHANGED
@@ -1,15 +1,22 @@
 
1
  import streamlit as st
2
- from dotenv import load_dotenv
3
  from PyPDF2 import PdfReader
4
- from langchain.text_splitter import CharacterTextSplitter
5
- from langchain.embeddings import OpenAIEmbeddings, HuggingFaceInstructEmbeddings
6
- from langchain.vectorstores import FAISS
7
- from langchain.chat_models import ChatOpenAI
8
- from langchain.memory import ConversationBufferMemory
9
- from langchain.chains import ConversationalRetrievalChain
10
- from htmlTemplates import css, bot_template, user_template
11
- from langchain.llms import HuggingFaceHub
 
 
 
 
 
 
12
 
 
13
  def get_pdf_text(pdf_docs):
14
  text = ""
15
  for pdf in pdf_docs:
@@ -18,87 +25,84 @@ def get_pdf_text(pdf_docs):
18
  text += page.extract_text()
19
  return text
20
 
21
def get_text_chunks(text):
    """Split the extracted PDF text into chunks.

    Configures a ``CharacterTextSplitter`` with a newline separator,
    ``chunk_size=1000``, ``chunk_overlap=200`` and ``len`` as the length
    function, and returns whatever ``split_text`` produces for *text*.
    """
    splitter_options = {
        "separator": "\n",
        "chunk_size": 1000,
        "chunk_overlap": 200,
        "length_function": len,
    }
    return CharacterTextSplitter(**splitter_options).split_text(text)
31
 
32
-
33
def get_vectorstore(text_chunks):
    """Build a FAISS vector store from *text_chunks* using OpenAI embeddings.

    Returns the store created by ``FAISS.from_texts``.
    """
    # Alternative embedding backend that can be swapped in here:
    # HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl")
    embedder = OpenAIEmbeddings()
    return FAISS.from_texts(texts=text_chunks, embedding=embedder)
38
-
39
-
40
def get_conversation_chain(vectorstore):
    """Create the conversational retrieval chain backed by *vectorstore*.

    The chain combines a ``ChatOpenAI`` model, the store's retriever and a
    ``ConversationBufferMemory`` stored under the ``chat_history`` key.
    """
    # Alternative LLM that can be swapped in here:
    # HuggingFaceHub(repo_id="google/flan-t5-xxl",
    #                model_kwargs={"temperature": 0.5, "max_length": 512})
    chat_model = ChatOpenAI()
    history = ConversationBufferMemory(
        return_messages=True,
        memory_key='chat_history',
    )
    chain_kwargs = {
        "llm": chat_model,
        "retriever": vectorstore.as_retriever(),
        "memory": history,
    }
    return ConversationalRetrievalChain.from_llm(**chain_kwargs)
52
 
 
53
 
54
def handle_userinput(user_question):
    """Send *user_question* to the conversation chain and render the history.

    The chain stored in ``st.session_state.conversation`` is called with the
    question, the returned ``chat_history`` is saved back to the session
    state, and every message is written into the user/bot HTML template.
    Even-indexed messages use ``user_template``; odd-indexed ones use
    ``bot_template``.

    If no chain has been created yet, a warning is shown and nothing else
    happens.
    """
    import html  # local import: only needed to escape message text below

    conversation = st.session_state.conversation
    if conversation is None:
        # The chain is only created once documents have been processed;
        # calling None would raise a TypeError.
        st.warning("Please upload and process your documents before asking a question.")
        return

    response = conversation({'question': user_question})
    st.session_state.chat_history = response['chat_history']

    for position, message in enumerate(st.session_state.chat_history):
        template = user_template if position % 2 == 0 else bot_template
        # The templates are rendered with unsafe_allow_html=True, so escape the
        # message text to keep user/model output from being interpreted as HTML.
        rendered = template.replace("{{MSG}}", html.escape(message.content))
        st.write(rendered, unsafe_allow_html=True)
65
 
 
66
 
67
def main():
    """Run the Streamlit page: question box plus a sidebar for uploading PDFs.

    Initialises the ``conversation`` and ``chat_history`` session entries,
    forwards a typed question to ``handle_userinput``, and on "Process"
    builds the vector store and conversation chain from the uploaded files.
    """
    load_dotenv()
    st.set_page_config(page_title="Chat with multiple PDFs",
                       page_icon=":books:")
    st.write(css, unsafe_allow_html=True)

    if "conversation" not in st.session_state:
        st.session_state.conversation = None
    if "chat_history" not in st.session_state:
        st.session_state.chat_history = None

    st.header("Chat with multiple PDFs :books:")
    user_question = st.text_input("Ask a question about your documents:")
    if user_question:
        handle_userinput(user_question)

    with st.sidebar:
        st.subheader("Your documents")
        pdf_docs = st.file_uploader(
            "Upload your PDFs here and click on 'Process'", accept_multiple_files=True)
        if st.button("Process"):
            if not pdf_docs:
                # Nothing uploaded: building an index from zero chunks would fail.
                st.warning("Please upload at least one PDF before processing.")
            else:
                with st.spinner("Processing"):
                    # get pdf text
                    raw_text = get_pdf_text(pdf_docs)

                    # get the text chunks
                    text_chunks = get_text_chunks(raw_text)

                    if not text_chunks:
                        # e.g. PDFs with no extractable text
                        st.error("No text could be extracted from the uploaded PDFs.")
                    else:
                        # create vector store
                        vectorstore = get_vectorstore(text_chunks)

                        # create conversation chain
                        st.session_state.conversation = get_conversation_chain(
                            vectorstore)


if __name__ == '__main__':
    main()
 
1
+ # Imports
2
  import streamlit as st
 
3
  from PyPDF2 import PdfReader
4
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
5
+ import os
6
+ from langchain_google_genai import GoogleGenerativeAIEmbeddings
7
+ import google.generativeai as genai
8
+ from langchain_community.vectorstores import FAISS
9
+ from langchain_google_genai import ChatGoogleGenerativeAI
10
+ from langchain.chains.question_answering import load_qa_chain
11
+ from langchain.prompts import PromptTemplate
12
+ from dotenv import load_dotenv
13
+
14
# Load environment variables via python-dotenv, then configure the Google
# Generative AI client with the GOOGLE_API_KEY value read from the environment.
load_dotenv()
genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))
18
 
19
+ # Extracts text from all pages of provided PDF documents
20
  def get_pdf_text(pdf_docs):
21
  text = ""
22
  for pdf in pdf_docs:
 
25
  text += page.extract_text()
26
  return text
27
 
28
def get_text_chunks(text):
    """Split *text* into chunks for embedding.

    Uses a ``RecursiveCharacterTextSplitter`` configured with
    ``chunk_size=10000`` and ``chunk_overlap=1000`` and returns the result of
    its ``split_text`` call.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_overlap=1000,
        chunk_size=10000,
    )
    return splitter.split_text(text)
33
 
34
def get_vector_store(text_chunks):
    """Embed *text_chunks* and persist the resulting FAISS index.

    The index is built with the ``models/embedding-001`` Google embedding
    model and saved locally under ``faiss_index``. Nothing is returned.
    """
    embedder = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
    FAISS.from_texts(text_chunks, embedding=embedder).save_local("faiss_index")
39
+
40
def get_conversational_chain():
    """Create and return the question-answering chain.

    The chain is a "stuff" QA chain (``load_qa_chain``) around the
    ``gemini-pro`` chat model, driven by a prompt with two input variables:
    ``context`` and ``question``.
    """
    # NOTE: {context} is followed directly by a newline. A trailing "?" here
    # would be appended to the retrieved document text in every prompt.
    prompt_template = """
    Answer the question concisely, focusing on the most relevant and important details from the PDF context.
    Refrain from mentioning any mathematical equations, even if they are present in provided context.
    Focus on the textual information available. Please provide direct quotations or references from PDF
    to back up your response. If the answer is not found within the PDF,
    please state "answer is not available in the context."\n\n
    Context:\n {context}\n
    Question: \n{question}\n
    Example response format:
    Overview:
    (brief summary or introduction)
    Key points:
    (point 1: paragraph for key details)
    (point 2: paragraph for key details)
    ...
    Use a mix of paragraphs and points to effectively convey the information.
    """

    # A low temperature is used to reduce model creativity and keep the
    # answers focused on factual accuracy.
    model = ChatGoogleGenerativeAI(model="gemini-pro", temperature=0.2)

    prompt = PromptTemplate(template=prompt_template, input_variables=["context", "question"])
    chain = load_qa_chain(model, chain_type="stuff", prompt=prompt)

    return chain
68
+
69
def user_input(user_question):
    """Answer *user_question* from the saved FAISS index and display the reply.

    Loads the ``faiss_index`` directory, runs a similarity search for the
    question, passes the matching documents to the QA chain and writes the
    chain's ``output_text`` to the page.

    If the index directory does not exist, a warning is shown and nothing
    else happens.
    """
    if not os.path.isdir("faiss_index"):
        # The index is only written after PDFs have been processed; loading a
        # missing index would crash the page.
        st.warning("Please upload and process your PDF files before asking a question.")
        return

    embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

    # NOTE(review): newer langchain_community releases may require
    # allow_dangerous_deserialization=True here, because the index is
    # stored with pickle - confirm against the installed version.
    new_db = FAISS.load_local("faiss_index", embeddings)
    docs = new_db.similarity_search(user_question)

    chain = get_conversational_chain()

    response = chain.invoke(
        {"input_documents": docs, "question": user_question},
        return_only_outputs=True
    )

    st.write("Reply: ", response["output_text"], "")
84
 
85
def main():
    """Run the Streamlit page: question box plus a sidebar for uploading PDFs.

    A typed question is forwarded to ``user_input``. Pressing
    "Submit & Process" extracts text from the uploaded files, splits it into
    chunks and saves the vector store.
    """
    st.set_page_config(page_title="Chat with PDFs", page_icon="")
    st.header("Chat with multiple PDFs using AI 💬")

    user_question = st.text_input("Ask a Question from PDF file(s)")

    if user_question:
        user_input(user_question)

    with st.sidebar:
        st.title("Menu ")
        pdf_docs = st.file_uploader("Upload your PDF Files and Click on the Submit & Process Button ",
                                    accept_multiple_files=True)

        if st.button("Submit & Process"):
            if not pdf_docs:
                # Nothing uploaded: building an index from zero chunks would fail.
                st.warning("Please upload at least one PDF file before processing.")
            else:
                with st.spinner("Processing..."):
                    raw_text = get_pdf_text(pdf_docs)
                    text_chunks = get_text_chunks(raw_text)
                    if not text_chunks:
                        # e.g. PDFs with no extractable text
                        st.error("No text could be extracted from the uploaded PDF files.")
                    else:
                        get_vector_store(text_chunks)
                        st.success("Done ✨")


if __name__ == "__main__":
    main()