"""Streamlit app: question answering over uploaded PDFs with Gemini and FAISS.

Workflow:
1. The user uploads PDFs in the sidebar and clicks "Process Documents".
2. Each page is split into chunks tagged with file name and page number.
3. The chunks are embedded and stored in an in-memory FAISS index that is
   kept in ``st.session_state`` across reruns.
4. A question is answered by a "stuff" QA chain over the most similar chunks,
   and the pages those chunks came from are listed as sources.
"""

import asyncio
import os

import streamlit as st
from dotenv import load_dotenv
from langchain.chains.question_answering import load_qa_chain
from langchain.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
from PyPDF2 import PdfReader

# Page configuration goes first so that no other Streamlit call (including the
# missing-key error below) can run ahead of it.
st.set_page_config(page_title="📚 RAG Study Bot", layout="wide")

# Sentence the model is told to reply with when the context has no answer.
# The UI matches on it (case-insensitively) to decide whether to show sources,
# so the prompt and the check are both derived from this one constant.
NOT_FOUND_ANSWER = "The answer is not available in the context"

# Only the middle piece is an f-string; {context} and {question} stay literal
# placeholders for PromptTemplate.
PROMPT_TEMPLATE = (
    "Answer the question as detailed as possible from the provided context.\n"
    "If the answer is not in the provided context, just say, "
    f'"{NOT_FOUND_ANSWER}".\n'
    "Don't provide a wrong answer.\n\n"
    "Context:\n{context}\n\n"
    "Question:\n{question}\n\n"
    "Answer:\n"
)


def _load_api_key():
    """Return the Google API key, or ``None`` when it is not configured.

    Streamlit secrets are consulted first; if the secrets file or the key is
    missing, the ``GOOGLE_API_KEY`` environment variable (populated from
    ``.env`` by ``load_dotenv``) is used as a fallback.
    """
    try:
        key = st.secrets["GOOGLE_API_KEY"]
    except (KeyError, FileNotFoundError):
        # Missing key or missing secrets file: fall through to the environment.
        key = None
    return key or os.getenv("GOOGLE_API_KEY")


def _configure_gemini(key):
    """Configure the ``google.generativeai`` client with ``key``."""
    # Imported lazily, as in the original flow: only needed once a key exists.
    import google.generativeai as genai

    genai.configure(api_key=key)


# Load environment variables from .env, then resolve and apply the API key.
load_dotenv()
api_key = _load_api_key()

if not api_key:
    st.error(
        "Google API Key not found. Please set GOOGLE_API_KEY in "
        ".streamlit/secrets.toml or in the .env file."
    )
    st.stop()

_configure_gemini(api_key)


# --- PDF Processing and Text Chunking ---
def _chunks_from_pdf(pdf, text_splitter):
    """Return the chunk dictionaries for a single uploaded PDF.

    Pages for which ``extract_text`` returns nothing are skipped.
    """
    pdf_chunks = []
    reader = PdfReader(pdf)
    for page_num, page in enumerate(reader.pages, start=1):
        text = page.extract_text()
        if not text:
            continue
        pdf_chunks.extend(
            {"content": chunk, "metadata": {"source": pdf.name, "page": page_num}}
            for chunk in text_splitter.split_text(text)
        )
    return pdf_chunks


def get_chunks_from_pdfs(pdf_docs):
    """Extract text from PDFs, split it into chunks, and attach metadata.

    Args:
        pdf_docs: Iterable of uploaded file objects; each must be readable by
            ``PdfReader`` and expose a ``name`` attribute.

    Returns:
        A list of ``{"content": str, "metadata": {"source": str, "page": int}}``
        dictionaries, where ``page`` is 1-based. A PDF that cannot be read is
        reported with ``st.warning`` and contributes no chunks.
    """
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=10000, chunk_overlap=1000)
    chunks_with_metadata = []
    for pdf in pdf_docs:
        try:
            chunks_with_metadata.extend(_chunks_from_pdf(pdf, text_splitter))
        except Exception as exc:  # UI boundary: one bad file must not abort the batch.
            st.warning(f"Could not read '{pdf.name}': {exc}")
    return chunks_with_metadata


# --- Vector Store Creation ---
def _ensure_event_loop():
    """Make sure the current thread has an open asyncio event loop.

    An existing open loop is reused; a new one is created only when the thread
    has none, instead of creating a fresh loop on every call.
    """
    try:
        loop = asyncio.get_event_loop()
    except RuntimeError:
        loop = None
    if loop is None or loop.is_closed():
        asyncio.set_event_loop(asyncio.new_event_loop())


def get_vector_store(chunks_with_metadata):
    """Create a FAISS vector store from chunks and keep it in session state.

    Args:
        chunks_with_metadata: List of dictionaries with ``"content"`` and
            ``"metadata"`` keys, as produced by ``get_chunks_from_pdfs``.

    Returns:
        None. On success the store is written to
        ``st.session_state.vector_store``; on failure an error is shown and
        the previous session value is left untouched.
    """
    if not chunks_with_metadata:
        st.warning("No text chunks to process. Please upload and process PDFs.")
        return

    try:
        # The embeddings call is made with an event loop set on this thread.
        _ensure_event_loop()

        embeddings = GoogleGenerativeAIEmbeddings(
            model="models/embedding-001", google_api_key=api_key
        )

        # Embed only the text; the metadata travels alongside for citation.
        texts = [chunk["content"] for chunk in chunks_with_metadata]
        metadatas = [chunk["metadata"] for chunk in chunks_with_metadata]

        st.session_state.vector_store = FAISS.from_texts(
            texts=texts, embedding=embeddings, metadatas=metadatas
        )
        st.success("Vector Store created successfully!")
    except Exception as e:  # UI boundary: surface the failure to the user.
        st.error(f"Error creating vector store: {e}")


# --- Conversational Chain Creation ---
def get_conversational_chain():
    """Create a "stuff" QA chain that answers strictly from the given context.

    Returns:
        The chain built by ``load_qa_chain``; it takes ``input_documents`` and
        ``question`` and produces ``output_text``.
    """
    model = ChatGoogleGenerativeAI(
        model="gemini-2.0-flash", temperature=0.3, google_api_key=api_key
    )
    prompt = PromptTemplate(
        template=PROMPT_TEMPLATE, input_variables=["context", "question"]
    )
    return load_qa_chain(model, chain_type="stuff", prompt=prompt)


def _format_sources(docs):
    """Return de-duplicated source lines for ``docs`` in a stable order.

    Lines are sorted by file name and then page number so the list does not
    change order between reruns.
    """
    unique_sources = sorted(
        {(doc.metadata["source"], doc.metadata["page"]) for doc in docs}
    )
    return [f"File: **{source}** | Page: **{page}**" for source, page in unique_sources]


# --- Streamlit UI ---
st.title("📚 RAG-powered Study and QA Chatbot")

# Initialize session state for the vector store.
if "vector_store" not in st.session_state:
    st.session_state.vector_store = None

# Sidebar for PDF upload and processing.
with st.sidebar:
    st.header("Your Study Documents")
    pdf_docs = st.file_uploader(
        "Upload PDF Files and Click 'Process'",
        accept_multiple_files=True,
        type="pdf",
    )
    if st.button("Process Documents"):
        if pdf_docs:
            with st.spinner("Processing documents..."):
                # 1. Get chunks with metadata, 2. create the vector store.
                get_vector_store(get_chunks_from_pdfs(pdf_docs))
        else:
            st.warning("Please upload at least one PDF file.")

# Main area for question input and answer display.
st.header("Ask a Question")
user_question = st.text_input("What would you like to know from your documents?")

if st.button("Get Answer"):
    question = user_question.strip() if user_question else ""
    if not question:
        st.warning("Please enter a question.")
    elif st.session_state.vector_store is None:
        st.warning("Documents not processed. Please upload and process your PDFs first.")
    else:
        with st.spinner("Searching for the answer..."):
            try:
                docs = st.session_state.vector_store.similarity_search(question)
                chain = get_conversational_chain()
                response = chain.invoke(
                    {"input_documents": docs, "question": question},
                    return_only_outputs=True,
                )
                answer_text = response["output_text"]

                st.subheader("Answer:")
                st.write(answer_text)

                # Show sources only when the model actually found an answer.
                if NOT_FOUND_ANSWER.lower() not in answer_text.lower():
                    st.subheader("Sources:")
                    for source in _format_sources(docs):
                        st.markdown(f"- {source}")
            except Exception as e:  # UI boundary: surface the failure to the user.
                st.error(f"An error occurred: {e}")