yousifalishah committed on
Commit
df29bbf
·
verified ·
1 Parent(s): b456574

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +25 -21
app.py CHANGED
@@ -4,8 +4,8 @@ from dotenv import load_dotenv
4
  import streamlit as st
5
  from PyPDF2 import PdfReader
6
  from langchain.text_splitter import CharacterTextSplitter
7
- from sentence_transformers import SentenceTransformer
8
- from langchain.vectorstores import FAISS
9
  from langchain.memory import ConversationBufferMemory
10
  from langchain.chains import ConversationalRetrievalChain
11
  from groq import Groq
@@ -19,35 +19,39 @@ logging.basicConfig(
19
  format='%(asctime)s - %(levelname)s - %(message)s'
20
  )
21
 
22
- # Function to extract text from PDF files
23
  def get_pdf_text(pdf_docs):
 
24
  text = ""
25
  for pdf in pdf_docs:
26
  pdf_reader = PdfReader(pdf)
27
  for page in pdf_reader.pages:
28
- text += page.extract_text()
29
  return text
30
 
31
- # Function to split the extracted text into chunks
32
  def get_text_chunks(text):
 
33
  text_splitter = CharacterTextSplitter(
34
  separator="\n",
35
  chunk_size=1000,
36
  chunk_overlap=200,
37
  length_function=len
38
  )
39
- chunks = text_splitter.split_text(text)
40
- return chunks
41
 
42
- # Function to create a FAISS vectorstore
43
  def get_vectorstore(text_chunks):
44
- model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
45
- embeddings = model.encode(text_chunks, convert_to_tensor=True)
46
- vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
47
- return vectorstore
 
 
 
 
 
 
48
 
49
- # Function to set up the conversational retrieval chain
50
  def get_conversation_chain(vectorstore):
 
51
  try:
52
  client = Groq(api_key=os.getenv("GROQ_API_KEY"))
53
  conversation_chain = ConversationalRetrievalChain.from_llm(
@@ -60,12 +64,13 @@ def get_conversation_chain(vectorstore):
60
  except Exception as e:
61
  logging.error(f"Error creating conversation chain: {e}")
62
  st.error("An error occurred while setting up the conversation chain.")
 
63
 
64
- # Handle user input
65
  def handle_userinput(user_question):
 
66
  if st.session_state.conversation is not None:
67
  response = st.session_state.conversation({'question': user_question})
68
- st.session_state.chat_history = response['chat_history']
69
 
70
  for i, message in enumerate(st.session_state.chat_history):
71
  if i % 2 == 0:
@@ -75,15 +80,15 @@ def handle_userinput(user_question):
75
  else:
76
  st.warning("Please process the documents first.")
77
 
78
- # Main function to run the Streamlit app
79
  def main():
 
80
  load_dotenv()
81
  st.set_page_config(page_title="Chat with multiple PDFs", page_icon=":books:")
82
 
83
  if "conversation" not in st.session_state:
84
  st.session_state.conversation = None
85
  if "chat_history" not in st.session_state:
86
- st.session_state.chat_history = None
87
 
88
  st.header("Chat with multiple PDFs :books:")
89
  user_question = st.text_input("Ask a question about your documents:")
@@ -92,15 +97,14 @@ def main():
92
 
93
  with st.sidebar:
94
  st.subheader("Your documents")
95
- pdf_docs = st.file_uploader(
96
- "Upload your PDFs here and click on 'Process'", accept_multiple_files=True
97
- )
98
  if st.button("Process"):
99
  with st.spinner("Processing..."):
100
  raw_text = get_pdf_text(pdf_docs)
101
  text_chunks = get_text_chunks(raw_text)
102
  vectorstore = get_vectorstore(text_chunks)
103
- st.session_state.conversation = get_conversation_chain(vectorstore)
 
104
 
105
  if __name__ == '__main__':
106
  main()
 
4
  import streamlit as st
5
  from PyPDF2 import PdfReader
6
  from langchain.text_splitter import CharacterTextSplitter
7
+ from langchain_community.vectorstores import FAISS
8
+ from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
9
  from langchain.memory import ConversationBufferMemory
10
  from langchain.chains import ConversationalRetrievalChain
11
  from groq import Groq
 
19
  format='%(asctime)s - %(levelname)s - %(message)s'
20
  )
21
 
 
22
  def get_pdf_text(pdf_docs):
23
+ """Extract text from uploaded PDF files."""
24
  text = ""
25
  for pdf in pdf_docs:
26
  pdf_reader = PdfReader(pdf)
27
  for page in pdf_reader.pages:
28
+ text += page.extract_text() or ""
29
  return text
30
 
 
31
  def get_text_chunks(text):
32
+ """Split the extracted text into manageable chunks."""
33
  text_splitter = CharacterTextSplitter(
34
  separator="\n",
35
  chunk_size=1000,
36
  chunk_overlap=200,
37
  length_function=len
38
  )
39
+ return text_splitter.split_text(text)
 
40
 
 
41
  def get_vectorstore(text_chunks):
42
+ """Create a FAISS vectorstore from text chunks."""
43
+ try:
44
+ embedding_function = SentenceTransformerEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')
45
+ vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embedding_function)
46
+ logging.info("Vectorstore created successfully.")
47
+ return vectorstore
48
+ except Exception as e:
49
+ logging.error(f"Error creating vectorstore: {e}")
50
+ st.error("An error occurred while creating the vectorstore.")
51
+ return None
52
 
 
53
  def get_conversation_chain(vectorstore):
54
+ """Set up the conversational retrieval chain."""
55
  try:
56
  client = Groq(api_key=os.getenv("GROQ_API_KEY"))
57
  conversation_chain = ConversationalRetrievalChain.from_llm(
 
64
  except Exception as e:
65
  logging.error(f"Error creating conversation chain: {e}")
66
  st.error("An error occurred while setting up the conversation chain.")
67
+ return None
68
 
 
69
  def handle_userinput(user_question):
70
+ """Handle user input and generate a response."""
71
  if st.session_state.conversation is not None:
72
  response = st.session_state.conversation({'question': user_question})
73
+ st.session_state.chat_history = response.get('chat_history', [])
74
 
75
  for i, message in enumerate(st.session_state.chat_history):
76
  if i % 2 == 0:
 
80
  else:
81
  st.warning("Please process the documents first.")
82
 
 
83
  def main():
84
+ """Run the Streamlit app."""
85
  load_dotenv()
86
  st.set_page_config(page_title="Chat with multiple PDFs", page_icon=":books:")
87
 
88
  if "conversation" not in st.session_state:
89
  st.session_state.conversation = None
90
  if "chat_history" not in st.session_state:
91
+ st.session_state.chat_history = []
92
 
93
  st.header("Chat with multiple PDFs :books:")
94
  user_question = st.text_input("Ask a question about your documents:")
 
97
 
98
  with st.sidebar:
99
  st.subheader("Your documents")
100
+ pdf_docs = st.file_uploader("Upload your PDFs here and click on 'Process'", accept_multiple_files=True)
 
 
101
  if st.button("Process"):
102
  with st.spinner("Processing..."):
103
  raw_text = get_pdf_text(pdf_docs)
104
  text_chunks = get_text_chunks(raw_text)
105
  vectorstore = get_vectorstore(text_chunks)
106
+ if vectorstore:
107
+ st.session_state.conversation = get_conversation_chain(vectorstore)
108
 
109
  if __name__ == '__main__':
110
  main()