snehakingrani committed on
Commit 6d852a7 · verified · 1 Parent(s): af15820

Update app.py

Files changed (1)
  1. app.py +37 -96
app.py CHANGED
@@ -1,108 +1,49 @@
- import os
- import logging
- from dotenv import load_dotenv
  import streamlit as st
- from PyPDF2 import PdfReader
- from langchain.text_splitter import CharacterTextSplitter
- from langchain.embeddings import HuggingFaceInstructEmbeddings
  from langchain.vectorstores import FAISS
- from langchain.memory import ConversationBufferMemory
- from langchain.chains import ConversationalRetrievalChain
- from langchain_groq import ChatGroq

  # Load environment variables
  load_dotenv()

- # Set up logging
- logging.basicConfig(
-     level=logging.INFO,
-     format='%(asctime)s - %(levelname)s - %(message)s'
- )

- # Function to extract text from PDF files
- def get_pdf_text(pdf_docs):
-     text = ""
-     for pdf in pdf_docs:
-         pdf_reader = PdfReader(pdf)
-         for page in pdf_reader.pages:
-             text += page.extract_text()
-     return text

- # Function to split the extracted text into chunks
- def get_text_chunks(text):
-     text_splitter = CharacterTextSplitter(
-         separator="\n",
-         chunk_size=1000,
-         chunk_overlap=200,
-         length_function=len
-     )
-     chunks = text_splitter.split_text(text)
-     return chunks

- # Function to create a FAISS vectorstore
- def get_vectorstore(text_chunks):
-     embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl")
-     vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
-     return vectorstore
-
- # Function to set up the conversational retrieval chain
- def get_conversation_chain(vectorstore):
-     try:
-         llm = ChatGroq(model="llama-3.3-70b-versatile", temperature=0.5)
-         memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)
-
-         conversation_chain = ConversationalRetrievalChain.from_llm(
-             llm=llm,
-             retriever=vectorstore.as_retriever(),
-             memory=memory
-         )
-
-         logging.info("Conversation chain created successfully.")
-         return conversation_chain
-     except Exception as e:
-         logging.error(f"Error creating conversation chain: {e}")
-         st.error("An error occurred while setting up the conversation chain.")

- # Handle user input
- def handle_userinput(user_question):
-     if st.session_state.conversation is not None:
-         response = st.session_state.conversation({'question': user_question})
-         st.session_state.chat_history = response['chat_history']
-
-         for i, message in enumerate(st.session_state.chat_history):
-             if i % 2 == 0:
-                 st.write(f"*User:* {message.content}")
-             else:
-                 st.write(f"*Bot:* {message.content}")
-     else:
-         st.warning("Please process the documents first.")

- # Main function to run the Streamlit app
- def main():
-     load_dotenv()
-     st.set_page_config(page_title="Chat with multiple PDFs", page_icon=":books:")
-
-     if "conversation" not in st.session_state:
-         st.session_state.conversation = None
-     if "chat_history" not in st.session_state:
-         st.session_state.chat_history = None
-
-     st.header("Chat with multiple PDFs :books:")
-     user_question = st.text_input("Ask a question about your documents:")
-     if user_question:
-         handle_userinput(user_question)
-
-     with st.sidebar:
-         st.subheader("Your documents")
-         pdf_docs = st.file_uploader(
-             "Upload your PDFs here and click on 'Process'", accept_multiple_files=True
-         )
-         if st.button("Process"):
-             with st.spinner("Processing..."):
-                 raw_text = get_pdf_text(pdf_docs)
-                 text_chunks = get_text_chunks(raw_text)
-                 vectorstore = get_vectorstore(text_chunks)
-                 st.session_state.conversation = get_conversation_chain(vectorstore)
-
- if __name__ == '__main__':
-     main()
  import streamlit as st
+ import PyPDF2
+ from langchain.embeddings import HuggingFaceEmbeddings
  from langchain.vectorstores import FAISS
+ from langchain.chat_models import ChatOpenAI
+ from langchain.chains import RetrievalQA
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from dotenv import load_dotenv

  # Load environment variables
  load_dotenv()

+ # Streamlit UI
+ st.title("PDF Q&A Assistant")
+ st.write("Upload a PDF and ask questions about its content.")

+ # Read the Groq API key from Streamlit secrets
+ groq_api_key = st.secrets["GROQ_API_KEY"]

+ # Initialize the Groq model via its OpenAI-compatible endpoint
+ # (https://api.groq.com/openai/v1); the model name matches the one
+ # used before this commit. Groq exposes no embeddings endpoint, so
+ # embeddings are computed locally with a sentence-transformers model.
+ llm = ChatOpenAI(
+     openai_api_key=groq_api_key,
+     openai_api_base="https://api.groq.com/openai/v1",
+     model_name="llama-3.3-70b-versatile",
+ )
+ embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

+ uploaded_file = st.file_uploader("Upload your PDF", type=["pdf"])

+ if uploaded_file:
+     with st.spinner("Processing PDF..."):
+         pdf_reader = PyPDF2.PdfReader(uploaded_file)
+         text = "".join([page.extract_text() for page in pdf_reader.pages if page.extract_text()])

+         # Split text into smaller chunks for better retrieval
+         text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
+         texts = text_splitter.split_text(text)

+         # Convert text to embeddings and store in FAISS
+         vector_store = FAISS.from_texts(texts, embeddings)
+         retriever = vector_store.as_retriever()
+         # Build the chain with the documented constructor; instantiating
+         # RetrievalQA directly would require a combine_documents_chain.
+         qa_chain = RetrievalQA.from_chain_type(llm=llm, retriever=retriever)
+
+     st.success("PDF processed successfully! Ask your questions below.")
+
+     query = st.text_input("Ask a question about the PDF")
+     if query:
+         response = qa_chain.run(query)
+         st.write("### Answer:")
+         st.write(response)
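
The new version reads the key from st.secrets; on a Hugging Face Space that is populated from the Space's settings, and for a local run the same key can go in .streamlit/secrets.toml. To verify the Groq wiring outside Streamlit before deploying, the chat endpoint can be exercised directly. A minimal sketch, assuming the same langchain version as app.py and that GROQ_API_KEY is exported as an environment variable rather than read from st.secrets (the model name is taken from the pre-commit version of app.py):

import os
from langchain.chat_models import ChatOpenAI

# Smoke test for Groq's OpenAI-compatible endpoint; assumes GROQ_API_KEY
# is set in the shell environment instead of Streamlit secrets.
llm = ChatOpenAI(
    openai_api_key=os.environ["GROQ_API_KEY"],
    openai_api_base="https://api.groq.com/openai/v1",
    model_name="llama-3.3-70b-versatile",
)
print(llm.predict("Reply with the single word: ok"))

If this prints a reply, the key and base URL are correct and any remaining failures in the app are on the retrieval side (PDF extraction, chunking, or the FAISS store) rather than the Groq connection.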