Spaces:
Build error
Build error
| # Import necessary libraries | |
| import streamlit as st | |
| import asyncio | |
| from PyPDF2 import PdfReader | |
| from langchain.text_splitter import RecursiveCharacterTextSplitter | |
| from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI | |
| from langchain.vectorstores import FAISS | |
| from langchain.chains.question_answering import load_qa_chain | |
| from langchain.prompts import PromptTemplate | |
| from dotenv import load_dotenv | |
# Load variables from a local .env file (if present) into the process
# environment so they can serve as a fallback below.
load_dotenv()

import os

# Prefer Streamlit secrets; fall back to the environment (populated by
# load_dotenv above) so a key supplied through .env is honoured as well.
try:
    api_key = st.secrets["GOOGLE_API_KEY"]
except (KeyError, FileNotFoundError):
    # Raised when the key, or the secrets file itself, is missing. Without this
    # guard the friendly error below could never be reached in that case.
    api_key = os.getenv("GOOGLE_API_KEY")

# Configure Gemini API
if api_key:
    import google.generativeai as genai

    genai.configure(api_key=api_key)
else:
    st.error(
        "Google API Key not found. Please set GOOGLE_API_KEY in Streamlit "
        "secrets or in the .env file."
    )
    st.stop()
# --- PDF Processing and Text Chunking ---
def get_chunks_from_pdfs(pdf_docs):
    """Split the text of every page of the given PDFs into chunks.

    Each page is extracted and split on its own, so a chunk never spans two
    pages. Pages for which no text could be extracted are skipped.

    Args:
        pdf_docs: Iterable of PDF file objects accepted by ``PdfReader`` that
            also expose a ``name`` attribute.

    Returns:
        A list of dicts, one per chunk, of the form
        ``{"content": <chunk text>, "metadata": {"source": <file name>,
        "page": <1-based page number>}}``.
    """
    splitter = RecursiveCharacterTextSplitter(chunk_size=10000, chunk_overlap=1000)
    records = []
    for uploaded in pdf_docs:
        # Number pages from 1 so the metadata matches what a reader would see.
        for page_no, page in enumerate(PdfReader(uploaded).pages, start=1):
            page_text = page.extract_text()
            if not page_text:
                continue
            records.extend(
                {
                    "content": piece,
                    "metadata": {"source": uploaded.name, "page": page_no},
                }
                for piece in splitter.split_text(page_text)
            )
    return records
# --- Vector Store Creation ---
def _ensure_event_loop():
    """Install an asyncio event loop for the current thread if it lacks one."""
    try:
        loop = asyncio.get_event_loop()
    except RuntimeError:
        # No loop is set for this thread.
        loop = None
    if loop is None or loop.is_closed():
        asyncio.set_event_loop(asyncio.new_event_loop())


def get_vector_store(chunks_with_metadata):
    """Build a FAISS vector store from chunks and keep it in session state.

    Args:
        chunks_with_metadata: List of dicts with a ``"content"`` string and a
            ``"metadata"`` dict, as produced by ``get_chunks_from_pdfs``.

    Returns:
        None. On success the store is saved to
        ``st.session_state.vector_store``; on failure an error is shown in the
        UI and session state is left untouched.
    """
    if not chunks_with_metadata:
        st.warning("No text chunks to process. Please upload and process PDFs.")
        return
    try:
        # Reuse the thread's event loop when it already has one; creating and
        # installing a fresh loop on every call would leak the previous one.
        _ensure_event_loop()
        # Initialize embeddings
        embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
        # Embed only the text, but keep the metadata aligned index-by-index so
        # each stored vector can be traced back to its file and page.
        texts = [chunk["content"] for chunk in chunks_with_metadata]
        metadatas = [chunk["metadata"] for chunk in chunks_with_metadata]
        vector_store = FAISS.from_texts(
            texts=texts, embedding=embeddings, metadatas=metadatas
        )
        st.session_state.vector_store = vector_store
        st.success("Vector Store created successfully!")
    except Exception as e:
        # UI boundary: surface any failure to the user instead of a traceback.
        st.error(f"Error creating vector store: {e}")
# --- Conversational Chain Creation ---
def get_conversational_chain():
    """Build the question-answering chain used to answer from retrieved text.

    The chain uses the "stuff" strategy with a Gemini chat model and a prompt
    that tells the model to answer only from the supplied context, and to reply
    with a fixed sentence when the context does not contain the answer.

    Returns:
        The chain object produced by ``load_qa_chain``.
    """
    template_text = """
Answer the question as detailed as possible from the provided context. If the answer is not in
the provided context, just say, "The answer is not available in the context". Don't provide a wrong answer.
Context:
{context}
Question:
{question}
Answer:
"""
    llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash", temperature=0.3)
    qa_prompt = PromptTemplate(
        template=template_text,
        input_variables=["context", "question"],
    )
    return load_qa_chain(llm, chain_type="stuff", prompt=qa_prompt)
# --- Streamlit UI ---
# Page configuration (browser tab title and wide layout), then the page heading.
st.set_page_config(page_title="📚 RAG Study Bot", layout="wide")
st.title("📚 RAG-powered Study and QA Chatbot")

# The vector store must survive Streamlit's script reruns, so it lives in
# session state; start with "no documents processed yet".
if "vector_store" not in st.session_state:
    st.session_state.vector_store = None
# Sidebar: upload PDFs and, on request, turn them into the vector store.
with st.sidebar:
    st.header("Your Study Documents")
    pdf_docs = st.file_uploader(
        "Upload PDF Files and Click 'Process'",
        accept_multiple_files=True,
        type="pdf",
    )
    if st.button("Process Documents"):
        if not pdf_docs:
            st.warning("Please upload at least one PDF file.")
        else:
            with st.spinner("Processing documents..."):
                # Step 1: extract and chunk the text, keeping file/page metadata.
                chunks = get_chunks_from_pdfs(pdf_docs)
                # Step 2: embed the chunks and store the index in session state.
                get_vector_store(chunks)
# Main area for question input and answer display
st.header("Ask a Question")
user_question = st.text_input("What would you like to know from your documents?")

# Button to get answer
if st.button("Get Answer"):
    # Treat a whitespace-only question as empty rather than sending it on.
    question = (user_question or "").strip()
    if not question:
        st.warning("Please enter a question.")
    elif st.session_state.vector_store is None:
        st.warning("Documents not processed. Please upload and process your PDFs first.")
    else:
        with st.spinner("Searching for the answer..."):
            try:
                vector_store = st.session_state.vector_store
                # Retrieve the chunks most similar to the question ...
                docs = vector_store.similarity_search(question)
                # ... and let the model answer from exactly those chunks.
                chain = get_conversational_chain()
                response = chain.invoke(
                    {"input_documents": docs, "question": question},
                    return_only_outputs=True,
                )
                answer_text = response["output_text"]
                st.subheader("Answer:")
                st.write(answer_text)
                # Display sources only if the answer is found in the context
                # (the prompt makes the model reply with this fixed sentence
                # when it is not).
                if "the answer is not available in the context" not in answer_text.lower():
                    st.subheader("Sources:")
                    # dict.fromkeys removes duplicates while keeping retrieval
                    # order, so the list is stable between runs.
                    sources = dict.fromkeys(
                        f"File: **{doc.metadata['source']}** | Page: **{doc.metadata['page']}**"
                        for doc in docs
                    )
                    for source in sources:
                        st.markdown(f"- {source}")
            except Exception as e:
                # UI boundary: show the failure to the user instead of a traceback.
                st.error(f"An error occurred: {e}")