# RAG-document_QA / app.py
# Uploaded by mshabir ("Upload 3 files", commit e9f434c, verified)
import streamlit as st
from PyPDF2 import PdfReader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_groq import ChatGroq
import os
# Page config: browser-tab title/icon and wide layout for the two-column UI below
st.set_page_config(page_title="RAG Document Q&A", page_icon="πŸ“š", layout="wide")
# Title and one-line usage hint shown at the top of the page
st.title("πŸ“š RAG Document Q&A System")
st.markdown("Upload PDFs and ask questions about them!")
# Sidebar: Groq API-key entry plus a short "about" section.
# (Indentation restored — the pasted copy had lost all block structure.)
with st.sidebar:
    st.header("βš™οΈ Configuration")
    # Key is held only in this session's variables (password-masked input);
    # it is consumed later when the ChatGroq client is created.
    api_key = st.text_input("Enter Groq API Key:", type="password")
    st.markdown("[Get free API key from Groq](https://console.groq.com/)")
    st.markdown("---")
    st.markdown("### About")
    st.markdown("This RAG system uses:")
    st.markdown("- πŸ€– Groq (Llama 3.3)")
    st.markdown("- πŸ” Vector Search")
    st.markdown("- πŸ“„ PDF Processing")
# Initialize session state so the vector store and chat history survive
# Streamlit's script re-runs (every widget interaction re-executes the file).
# (Indentation restored — the pasted copy had lost all block structure.)
if 'vectorstore' not in st.session_state:
    st.session_state.vectorstore = None  # set once documents are processed
if 'chat_history' not in st.session_state:
    st.session_state.chat_history = []  # list of {"question", "answer"} dicts
# Main area: narrow left column (upload, weight 1) beside a wider right column (Q&A, weight 2)
col1, col2 = st.columns([1, 2])
# Left column: upload PDFs, extract text, chunk it, and build the vector store.
# (Indentation restored — the pasted copy had lost all block structure.)
with col1:
    st.header("πŸ“€ Upload Documents")
    uploaded_files = st.file_uploader(
        "Upload PDF files",
        type=['pdf'],
        accept_multiple_files=True
    )
    # Processing requires both at least one file and an API key from the sidebar
    if uploaded_files and api_key:
        if st.button("πŸ”„ Process Documents", type="primary"):
            with st.spinner("Processing PDFs..."):
                try:
                    # Extract text from every page of every uploaded PDF
                    all_text = ""
                    for pdf_file in uploaded_files:
                        pdf_reader = PdfReader(pdf_file)
                        for page in pdf_reader.pages:
                            # extract_text() may return None (e.g. image-only
                            # pages); coerce to "" so concatenation never raises
                            all_text += page.extract_text() or ""
                    # Split into overlapping chunks for retrieval
                    text_splitter = RecursiveCharacterTextSplitter(
                        chunk_size=1000,
                        chunk_overlap=200
                    )
                    chunks = text_splitter.split_text(all_text)
                    if not chunks:
                        # Nothing extractable (scanned/image-only PDFs) —
                        # warn instead of passing an empty list to Chroma
                        st.warning("No extractable text found in the uploaded PDF(s).")
                    else:
                        # Embed chunks and build an in-memory Chroma store,
                        # persisted across re-runs via session state
                        embeddings = HuggingFaceEmbeddings(
                            model_name="sentence-transformers/all-MiniLM-L6-v2"
                        )
                        st.session_state.vectorstore = Chroma.from_texts(
                            texts=chunks,
                            embedding=embeddings
                        )
                        st.success(f"βœ… Processed {len(uploaded_files)} PDF(s) into {len(chunks)} chunks!")
                except Exception as e:
                    # Surface any processing failure in the UI instead of crashing
                    st.error(f"Error: {str(e)}")
# Right column: ask questions against the processed documents (RAG loop:
# retrieve top-k chunks, stuff them into a prompt, ask the Groq LLM).
# (Indentation restored — the pasted copy had lost all block structure.)
with col2:
    st.header("πŸ’¬ Ask Questions")
    if st.session_state.vectorstore and api_key:
        # Question input — any non-empty value triggers a retrieval + LLM call
        question = st.text_input("Ask a question about your documents:")
        if question:
            with st.spinner("Thinking..."):
                try:
                    # Setup LLM (ChatGroq reads the key from the environment)
                    os.environ["GROQ_API_KEY"] = api_key
                    llm = ChatGroq(
                        model="llama-3.3-70b-versatile",
                        temperature=0  # deterministic answers
                    )
                    # Retrieve the 3 most similar chunks as context
                    docs = st.session_state.vectorstore.similarity_search(question, k=3)
                    context = "\n\n".join([doc.page_content for doc in docs])
                    # Stuff context + question into a single grounded prompt
                    prompt = f"""Answer based only on this context:
{context}
Question: {question}
Answer:"""
                    # Get answer
                    answer = llm.invoke(prompt)
                    # Display answer
                    st.markdown("### πŸ’‘ Answer")
                    st.markdown(answer.content)
                    # Show the retrieved source chunks (first 300 chars each)
                    with st.expander("πŸ“š View Sources"):
                        for i, doc in enumerate(docs, 1):
                            st.markdown(f"**Source {i}:**")
                            # Only append an ellipsis when the chunk was truncated
                            snippet = doc.page_content[:300]
                            if len(doc.page_content) > 300:
                                snippet += "..."
                            st.text(snippet)
                            st.markdown("---")
                    # Add to history (persisted in session state)
                    st.session_state.chat_history.append({
                        "question": question,
                        "answer": answer.content
                    })
                except Exception as e:
                    # Surface API/retrieval failures in the UI instead of crashing
                    st.error(f"Error: {str(e)}")
        # Show the last 5 Q&A pairs, newest first
        if st.session_state.chat_history:
            st.markdown("### πŸ“œ Chat History")
            for i, chat in enumerate(reversed(st.session_state.chat_history[-5:]), 1):
                with st.expander(f"Q{i}: {chat['question'][:50]}..."):
                    st.markdown(f"**Q:** {chat['question']}")
                    st.markdown(f"**A:** {chat['answer']}")
    else:
        # No vector store and/or no API key yet — point the user at the sidebar
        st.info("πŸ‘ˆ Upload PDFs and enter API key to get started!")
# Footer: horizontal rule plus attribution line
st.markdown("---")
st.markdown("Built with Streamlit, LangChain, and Groq πŸš€")