Spaces:

Emperor2004
/

RAG-Powered_Study_and_QA_Chatbot

Build error

App Files Files Community

RAG-Powered_Study_and_QA_Chatbot / app.py

Emperor2004

Upload app.py

3a6f9c8 verified 10 months ago

raw

history blame contribute delete

6.12 kB

	# Import necessary libraries
	import streamlit as st
	import asyncio
	from PyPDF2 import PdfReader
	from langchain.text_splitter import RecursiveCharacterTextSplitter
	from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI
	from langchain.vectorstores import FAISS
	from langchain.chains.question_answering import load_qa_chain
	from langchain.prompts import PromptTemplate
	from dotenv import load_dotenv

	# Load environment variables
	# Pull in a local .env file (if present) so the key can also be supplied
	# through the process environment.
	import os

	load_dotenv()

	# Resolve the Gemini API key: Streamlit secrets first, environment second.
	# Indexing st.secrets raises when there is no secrets file or the key is
	# absent; catching that lets the friendly error below be shown instead of
	# a traceback.
	try:
	    api_key = st.secrets["GOOGLE_API_KEY"]
	except (FileNotFoundError, KeyError):
	    api_key = None
	api_key = api_key or os.getenv("GOOGLE_API_KEY")

	# Configure Gemini API
	if api_key:
	    import google.generativeai as genai

	    genai.configure(api_key=api_key)
	else:
	    st.error(
	        "Google API Key not found. Please set GOOGLE_API_KEY in Streamlit "
	        "secrets or in the .env file."
	    )
	    # Halt this script run: nothing below can work without a key.
	    st.stop()

	# --- PDF Processing and Text Chunking ---
	def get_chunks_from_pdfs(pdf_docs):
	    """Read every page of the given PDFs and return chunk records.

	    Each record is a dict with the chunk text under "content" and, under
	    "metadata", the originating file name ("source") and 1-based page
	    number ("page"). Pages that yield no text are skipped.
	    """
	    splitter = RecursiveCharacterTextSplitter(chunk_size=10000, chunk_overlap=1000)
	    records = []

	    for document in pdf_docs:
	        reader = PdfReader(document)
	        # Start counting at 1 so the stored page number is human-friendly.
	        for page_number, page in enumerate(reader.pages, start=1):
	            page_text = page.extract_text()
	            if not page_text:
	                continue
	            records.extend(
	                {
	                    "content": piece,
	                    "metadata": {"source": document.name, "page": page_number},
	                }
	                for piece in splitter.split_text(page_text)
	            )

	    return records

	# --- Vector Store Creation ---
	def get_vector_store(chunks_with_metadata):
	    """Build a FAISS index from chunk records and keep it in session state.

	    Args:
	        chunks_with_metadata: List of dicts, each with a "content" string
	            and a "metadata" dict.

	    Returns:
	        None. On success the index is stored in
	        st.session_state.vector_store; on empty input or on failure a
	        Streamlit message is shown instead.
	    """
	    if not chunks_with_metadata:
	        st.warning("No text chunks to process. Please upload and process PDFs.")
	        return

	    try:
	        # Make sure this thread has a usable event loop for async operations.
	        # Reuse an open loop when one is already installed; creating a new
	        # one on every call would leave the previous loop unclosed.
	        try:
	            loop = asyncio.get_event_loop()
	        except RuntimeError:
	            loop = None
	        if loop is None or loop.is_closed():
	            asyncio.set_event_loop(asyncio.new_event_loop())

	        # Initialize embeddings
	        embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

	        # Split the records into parallel lists: text to embed, and the
	        # metadata stored alongside each vector.
	        texts = [chunk["content"] for chunk in chunks_with_metadata]
	        metadatas = [chunk["metadata"] for chunk in chunks_with_metadata]

	        # Use from_texts which accepts metadata
	        vector_store = FAISS.from_texts(texts=texts, embedding=embeddings, metadatas=metadatas)
	        st.session_state.vector_store = vector_store
	        st.success("Vector Store created successfully!")

	    except Exception as e:
	        # UI boundary: report any failure to the user rather than crash the app.
	        st.error(f"Error creating vector store: {e}")

	# --- Conversational Chain Creation ---
	def get_conversational_chain():
	    """Return a question-answering chain driven by a fixed custom prompt.

	    The prompt tells the model to answer only from the supplied context
	    and to say so when the answer is not there. The chain is built with
	    chain_type "stuff" on the gemini-2.0-flash chat model at temperature 0.3.
	    """
	    answer_template = """
	Answer the question as detailed as possible from the provided context. If the answer is not in
	the provided context, just say, "The answer is not available in the context". Don't provide a wrong answer.

	Context:
	{context}

	Question:
	{question}

	Answer:
	"""
	    qa_prompt = PromptTemplate(
	        template=answer_template,
	        input_variables=["context", "question"],
	    )
	    llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash", temperature=0.3)
	    return load_qa_chain(llm, chain_type="stuff", prompt=qa_prompt)

	# --- Streamlit UI ---

	# Page configuration
	st.set_page_config(layout="wide", page_title="📚 RAG Study Bot")
	st.title("📚 RAG-powered Study and QA Chatbot")

	# Give the session a vector-store slot on first run so later code can
	# read it unconditionally.
	if "vector_store" not in st.session_state:
	    st.session_state["vector_store"] = None

	# Sidebar: upload PDFs and build the vector store on demand.
	with st.sidebar:
	    st.header("Your Study Documents")
	    pdf_docs = st.file_uploader(
	        "Upload PDF Files and Click 'Process'",
	        type="pdf",
	        accept_multiple_files=True,
	    )
	    if st.button("Process Documents"):
	        if not pdf_docs:
	            st.warning("Please upload at least one PDF file.")
	        else:
	            with st.spinner("Processing documents..."):
	                # Chunk the PDFs, then embed and index the chunks.
	                get_vector_store(get_chunks_from_pdfs(pdf_docs))

	# Main area for question input and answer display
	st.header("Ask a Question")
	user_question = st.text_input("What would you like to know from your documents?")

	# Button to get answer
	if st.button("Get Answer"):
	    if not user_question:
	        st.warning("Please enter a question.")
	    elif st.session_state.vector_store is None:
	        st.warning("Documents not processed. Please upload and process your PDFs first.")
	    else:
	        with st.spinner("Searching for the answer..."):
	            try:
	                # Retrieve the most relevant chunks, then let the chain answer from them.
	                docs = st.session_state.vector_store.similarity_search(user_question)
	                chain = get_conversational_chain()
	                response = chain(
	                    {"input_documents": docs, "question": user_question},
	                    return_only_outputs=True,
	                )

	                answer_text = response["output_text"]

	                st.subheader("Answer:")
	                st.write(answer_text)

	                # Display sources only if the answer is found in the context
	                if "the answer is not available in the context" not in answer_text.lower():
	                    st.subheader("Sources:")
	                    # dict.fromkeys drops duplicates but keeps the order the
	                    # search returned, so the list is stable between runs.
	                    # The escaped backslash keeps the literal "\|" text
	                    # without an invalid escape sequence.
	                    sources = dict.fromkeys(
	                        f"File: {doc.metadata['source']} \\| Page: {doc.metadata['page']}"
	                        for doc in docs
	                    )
	                    for source in sources:
	                        st.markdown(f"- {source}")

	            except Exception as e:
	                # UI boundary: show the failure instead of crashing the page.
	                st.error(f"An error occurred: {e}")