snehakingrani committed on
Commit
617564f
·
verified ·
1 Parent(s): 6bd1187

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +98 -29
app.py CHANGED
@@ -1,40 +1,109 @@
 
 
 
1
  import streamlit as st
2
- import PyPDF2
3
- from langchain.text_splitter import RecursiveCharacterTextSplitter
 
4
  from langchain.vectorstores import FAISS
5
- from langchain.embeddings import HuggingFaceEmbeddings # Use Hugging Face for embeddings
 
6
  from langchain_groq import ChatGroq
7
- from langchain.chains import RetrievalQA
8
 
9
- # Set up Groq API key
10
- groq_api_key = "your_groq_api_key"
 
11
 
12
- # Initialize LLM using Groq API
13
- llm = ChatGroq(model_name="llama3-70b", api_key=groq_api_key)
 
 
 
14
 
15
- # Use Hugging Face Embeddings
16
- embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
 
 
 
 
 
 
17
 
18
- uploaded_file = st.file_uploader("Upload your PDF", type=["pdf"])
 
 
 
 
 
 
 
 
 
19
 
20
- if uploaded_file:
21
- with st.spinner("Processing PDF..."):
22
- pdf_reader = PyPDF2.PdfReader(uploaded_file)
23
- text = "".join([page.extract_text() for page in pdf_reader.pages if page.extract_text()])
 
24
 
25
- # Split text into smaller chunks for better retrieval
26
- text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
27
- texts = text_splitter.split_text(text)
28
-
29
- # Convert text to embeddings and store in FAISS
30
- vector_store = FAISS.from_texts(texts, embeddings)
31
- retriever = vector_store.as_retriever()
32
- qa_chain = RetrievalQA(llm=llm, retriever=retriever)
33
 
34
- st.success("PDF processed successfully! Ask your questions below.")
 
 
 
 
35
 
36
- query = st.text_input("Ask a question about the PDF")
37
- if query:
38
- response = qa_chain.run(query)
39
- st.write("### Answer:")
40
- st.write(response)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import logging
3
+ from dotenv import load_dotenv
4
  import streamlit as st
5
+ from PyPDF2 import PdfReader
6
+ from langchain.text_splitter import CharacterTextSplitter
7
+ from langchain.embeddings import HuggingFaceEmbeddings
8
  from langchain.vectorstores import FAISS
9
+ from langchain.memory import ConversationBufferMemory
10
+ from langchain.chains import ConversationalRetrievalChain
11
  from langchain_groq import ChatGroq
 
12
 
13
# Pull configuration from a local .env file into the process environment.
load_dotenv()

# Groq API key; None when the variable is absent (the failure surfaces later
# when building the conversation chain and is handled there).
GROQ_API_KEY = os.environ.get("GROQ_API_KEY")

# Application-wide logging: timestamped, level-tagged messages at INFO.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
22
 
23
# Function to extract text from PDF files
def get_pdf_text(pdf_docs):
    """Concatenate the extracted text of every page of every uploaded PDF.

    Args:
        pdf_docs: iterable of file-like objects readable by PyPDF2.PdfReader.

    Returns:
        One string with all extracted page text; empty string when pdf_docs
        is empty or no page yields any text.
    """
    text = ""
    for pdf in pdf_docs:
        pdf_reader = PdfReader(pdf)
        for page in pdf_reader.pages:
            # extract_text() may return None for image-only/scanned pages;
            # skip those instead of raising TypeError on concatenation.
            page_text = page.extract_text()
            if page_text:
                text += page_text
    return text
31
 
32
# Function to split the extracted text into chunks
def get_text_chunks(text):
    """Split *text* into overlapping chunks suitable for embedding.

    Splits on newlines into 1000-character chunks with a 200-character
    overlap so neighbouring chunks share context.
    """
    splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len,
    )
    return splitter.split_text(text)
42
 
43
# Function to create a FAISS vectorstore
def get_vectorstore(text_chunks):
    """Embed *text_chunks* with a MiniLM sentence-transformer and index them in FAISS."""
    model = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )
    return FAISS.from_texts(texts=text_chunks, embedding=model)
48
 
49
# Function to set up the conversational retrieval chain
def get_conversation_chain(vectorstore):
    """Build a ConversationalRetrievalChain over *vectorstore*.

    Returns:
        The configured chain, or None when setup fails (the error is logged
        and surfaced in the Streamlit UI). Callers must handle None.
    """
    try:
        llm = ChatGroq(model="llama-3.3-70b-versatile", groq_api_key=GROQ_API_KEY, temperature=0.5)
        # return_messages=True keeps the full exchange as message objects so
        # follow-up questions have conversational context.
        memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)

        conversation_chain = ConversationalRetrievalChain.from_llm(
            llm=llm,
            retriever=vectorstore.as_retriever(),
            memory=memory
        )

        logging.info("Conversation chain created successfully.")
        return conversation_chain
    except Exception as e:
        # logging.exception records the traceback, easing diagnosis of e.g.
        # a missing GROQ_API_KEY or a retired model name.
        logging.exception("Error creating conversation chain: %s", e)
        st.error("An error occurred while setting up the conversation chain.")
        return None  # explicit: previously fell through and returned None implicitly
66
+
67
# Handle user input
def handle_userinput(user_question):
    """Send *user_question* to the active conversation chain and render the chat.

    Warns the user when no conversation chain exists yet (documents not
    processed).
    """
    # Guard clause: nothing to ask until the documents have been processed.
    if st.session_state.conversation is None:
        st.warning("Please process the documents first.")
        return

    response = st.session_state.conversation({'question': user_question})
    st.session_state.chat_history = response['chat_history']

    # The history alternates question/answer, starting with the user.
    for idx, message in enumerate(st.session_state.chat_history):
        speaker = "*User:*" if idx % 2 == 0 else "*Bot:*"
        st.write(f"{speaker} {message.content}")
80
+
81
# Main function to run the Streamlit app
def main():
    """Entry point: wire up the Streamlit UI for chatting with multiple PDFs."""
    # NOTE: load_dotenv() already runs at module import; repeating it here was
    # redundant and has been removed.
    st.set_page_config(page_title="Chat with multiple PDFs", page_icon=":books:")

    # Initialise session state on first run so later lookups never fail.
    if "conversation" not in st.session_state:
        st.session_state.conversation = None
    if "chat_history" not in st.session_state:
        st.session_state.chat_history = None

    st.header("Chat with multiple PDFs :books:")
    user_question = st.text_input("Ask a question about your documents:")
    if user_question:
        handle_userinput(user_question)

    with st.sidebar:
        st.subheader("Your documents")
        pdf_docs = st.file_uploader(
            "Upload your PDFs here and click on 'Process'", accept_multiple_files=True
        )
        if st.button("Process"):
            # Guard: clicking Process with no files would feed an empty
            # corpus into FAISS and fail downstream.
            if not pdf_docs:
                st.warning("Please upload at least one PDF before processing.")
            else:
                with st.spinner("Processing..."):
                    raw_text = get_pdf_text(pdf_docs)
                    text_chunks = get_text_chunks(raw_text)
                    vectorstore = get_vectorstore(text_chunks)
                    st.session_state.conversation = get_conversation_chain(vectorstore)

if __name__ == '__main__':
    main()