# app.py -- uploaded by Emperor2004 (commit 3a6f9c8, verified)
# Import necessary libraries
import streamlit as st
import asyncio
from PyPDF2 import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI
from langchain.vectorstores import FAISS
from langchain.chains.question_answering import load_qa_chain
from langchain.prompts import PromptTemplate
from dotenv import load_dotenv
# Load environment variables from a local .env file, if one is present.
load_dotenv()

import os

# Resolve the Google API key. Streamlit secrets take precedence; the
# GOOGLE_API_KEY environment variable (which load_dotenv() may have just
# populated) is the fallback. Indexing st.secrets raises when the secrets
# file or the key is missing, so that case is caught here instead of
# crashing the app before the friendly error below can be shown.
try:
    api_key = st.secrets["GOOGLE_API_KEY"]
except (KeyError, FileNotFoundError):
    api_key = os.getenv("GOOGLE_API_KEY")

# Configure Gemini API
if api_key:
    import google.generativeai as genai

    genai.configure(api_key=api_key)
else:
    st.error(
        "Google API Key not found. Please set GOOGLE_API_KEY in the "
        "Streamlit secrets or in the .env file."
    )
    # Nothing below can work without a key, so halt this script run.
    st.stop()
# --- PDF Processing and Text Chunking ---
def get_chunks_from_pdfs(pdf_docs):
    """Split the text of each PDF page into chunks tagged with their origin.

    Args:
        pdf_docs: Iterable of PDF file objects. Each one is handed to
            ``PdfReader`` and must expose a ``name`` attribute, which is
            recorded as the chunk's source.

    Returns:
        A list of dicts, one per chunk, shaped as
        ``{"content": <chunk text>, "metadata": {"source": <file name>,
        "page": <1-based page number>}}``. Pages for which no text is
        extracted contribute no entries.
    """
    splitter = RecursiveCharacterTextSplitter(chunk_size=10000, chunk_overlap=1000)
    tagged_chunks = []
    for document in pdf_docs:
        reader = PdfReader(document)
        # Count pages from 1 so the stored number matches what a reader sees.
        for page_number, page in enumerate(reader.pages, start=1):
            page_text = page.extract_text()
            if not page_text:
                # Nothing extractable on this page; skip it.
                continue
            tagged_chunks.extend(
                {
                    "content": piece,
                    "metadata": {"source": document.name, "page": page_number},
                }
                for piece in splitter.split_text(page_text)
            )
    return tagged_chunks
# --- Vector Store Creation ---
def get_vector_store(chunks_with_metadata):
    """Embed the chunks and keep the resulting FAISS index in session state.

    Args:
        chunks_with_metadata: List of dicts with a ``"content"`` string and
            a ``"metadata"`` dict, as produced by ``get_chunks_from_pdfs``.

    Returns:
        None. On success the index is stored in
        ``st.session_state.vector_store`` and a success message is shown.
        An empty input produces a warning; any failure while embedding or
        indexing is reported through ``st.error`` rather than raised.
    """
    if not chunks_with_metadata:
        st.warning("No text chunks to process. Please upload and process PDFs.")
        return

    try:
        # Async operations need an event loop registered for this thread.
        # Reuse the current one when it is usable and only create a loop when
        # none exists (or it was closed); creating a fresh loop on every call
        # would leave each previous loop unclosed.
        try:
            current_loop = asyncio.get_event_loop()
        except RuntimeError:
            current_loop = None
        if current_loop is None or current_loop.is_closed():
            asyncio.set_event_loop(asyncio.new_event_loop())

        # Initialize embeddings
        embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

        # Embed only the text; the metadata travels alongside, index-aligned.
        texts = [chunk["content"] for chunk in chunks_with_metadata]
        metadatas = [chunk["metadata"] for chunk in chunks_with_metadata]

        # from_texts accepts per-text metadata.
        vector_store = FAISS.from_texts(texts=texts, embedding=embeddings, metadatas=metadatas)
        st.session_state.vector_store = vector_store
        st.success("Vector Store created successfully!")
    except Exception as e:
        # UI boundary: show the failure to the user instead of crashing the app.
        st.error(f"Error creating vector store: {e}")
# --- Conversational Chain Creation ---
def get_conversational_chain():
    """Build the "stuff" question-answering chain used to answer from context.

    The prompt tells the model to answer only from the supplied context and
    to reply with a fixed "not available" sentence otherwise.

    Returns:
        The chain produced by ``load_qa_chain`` for a ``gemini-2.0-flash``
        chat model (temperature 0.3) and the custom prompt, which takes the
        ``context`` and ``question`` variables.
    """
    template_text = """
Answer the question as detailed as possible from the provided context. If the answer is not in
the provided context, just say, "The answer is not available in the context". Don't provide a wrong answer.
Context:
{context}
Question:
{question}
Answer:
"""
    llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash", temperature=0.3)
    qa_prompt = PromptTemplate(
        template=template_text,
        input_variables=["context", "question"],
    )
    return load_qa_chain(llm, chain_type="stuff", prompt=qa_prompt)
# --- Streamlit UI ---
# Page configuration. The books emoji is written as an escape sequence so it
# survives any re-encoding of this file (it was previously stored as
# mis-decoded bytes and rendered as garbage characters).
st.set_page_config(page_title="\U0001F4DA RAG Study Bot", layout="wide")
st.title("\U0001F4DA RAG-powered Study and QA Chatbot")

# Initialize the session-state slot for the vector store so later code can
# check whether documents have been processed yet.
if "vector_store" not in st.session_state:
    st.session_state.vector_store = None
# Sidebar: upload PDFs and, on request, turn them into a vector store
with st.sidebar:
    st.header("Your Study Documents")
    pdf_docs = st.file_uploader(
        "Upload PDF Files and Click 'Process'",
        accept_multiple_files=True,
        type="pdf",
    )
    process_clicked = st.button("Process Documents")
    if process_clicked and not pdf_docs:
        # Button pressed with nothing uploaded.
        st.warning("Please upload at least one PDF file.")
    elif process_clicked:
        with st.spinner("Processing documents..."):
            # Step 1: extract page text and split it into chunks with metadata
            chunks = get_chunks_from_pdfs(pdf_docs)
            # Step 2: embed the chunks and store the index in session state
            get_vector_store(chunks)
# Main area for question input and answer display
st.header("Ask a Question")
user_question = st.text_input("What would you like to know from your documents?")

# Button to get answer
if st.button("Get Answer"):
    if not user_question or not user_question.strip():
        # Empty or whitespace-only input is not a question.
        st.warning("Please enter a question.")
    elif st.session_state.vector_store is None:
        st.warning("Documents not processed. Please upload and process your PDFs first.")
    else:
        with st.spinner("Searching for the answer..."):
            try:
                vector_store = st.session_state.vector_store
                # Retrieve the chunks most similar to the question.
                docs = vector_store.similarity_search(user_question)
                chain = get_conversational_chain()
                # invoke() replaces the deprecated direct call on the chain.
                response = chain.invoke(
                    {"input_documents": docs, "question": user_question},
                    return_only_outputs=True,
                )
                answer_text = response["output_text"]

                st.subheader("Answer:")
                st.write(answer_text)

                # Display sources only if the answer is found in the context
                if "the answer is not available in the context" not in answer_text.lower():
                    st.subheader("Sources:")
                    # dict.fromkeys removes duplicates while keeping the
                    # order in which the documents were retrieved, so the
                    # list is stable between runs (a set is not).
                    sources = dict.fromkeys(
                        f"File: **{doc.metadata['source']}** | Page: **{doc.metadata['page']}**"
                        for doc in docs
                    )
                    for source in sources:
                        st.markdown(f"- {source}")
            except Exception as e:
                # UI boundary: report the failure instead of crashing the app.
                st.error(f"An error occurred: {e}")