File size: 8,727 Bytes
4556f47
ae90dd6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4556f47
ae90dd6
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
import streamlit as st
from PyPDF2 import PdfReader
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
from transformers import pipeline

# Page config
st.set_page_config(
    page_title="PDF RAG Chatbot",
    page_icon="πŸ“š",
    layout="wide"
)

# Initialize session state.
# Streamlit reruns the whole script on every interaction, so each key is
# created only if missing; a single defaults table replaces five copy-pasted
# if-blocks and keeps the defaults in one place.
_SESSION_DEFAULTS = {
    'processed': False,        # True once both PDFs have been indexed
    'chunks': [],              # text chunks extracted from the PDFs
    'index': None,             # FAISS index over the chunk embeddings
    'embeddings_model': None,  # cached SentenceTransformer instance
    'qa_model': None,          # cached text2text-generation pipeline
}
for _key, _default in _SESSION_DEFAULTS.items():
    if _key not in st.session_state:
        st.session_state[_key] = _default

def extract_text_from_pdf(pdf_file):
    """Extract and concatenate the text of every page of a PDF.

    Args:
        pdf_file: A file path or binary file-like object (e.g. a Streamlit
            UploadedFile) readable by PyPDF2's PdfReader.

    Returns:
        The text of all pages concatenated into one string. Pages with no
        extractable text contribute an empty string instead of crashing.
    """
    pdf_reader = PdfReader(pdf_file)
    # extract_text() can return None for image-only/scanned pages; the
    # original `text += page.extract_text()` would raise TypeError there.
    return "".join(page.extract_text() or "" for page in pdf_reader.pages)

def split_text_into_chunks(text, chunk_size=1000, overlap=200):
    """Split text into fixed-size character chunks with overlap.

    Consecutive chunks share `overlap` characters so sentences cut at a
    chunk boundary still appear whole in the neighbouring chunk.

    Args:
        text: The full document text.
        chunk_size: Maximum characters per chunk (must be positive).
        overlap: Characters shared between consecutive chunks
            (must satisfy 0 <= overlap < chunk_size).

    Returns:
        A list of non-blank chunk strings (whitespace-only chunks dropped).

    Raises:
        ValueError: If the parameters would make the window step
            non-positive — the original while-loop looped forever in
            that case.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be positive")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(text), step):
        chunk = text[start:start + chunk_size]
        if chunk.strip():  # Only keep chunks with visible content
            chunks.append(chunk)
    return chunks

def create_embeddings(chunks, model):
    """Encode text chunks into dense vectors with the given embedding model.

    Args:
        chunks: List of text strings to embed.
        model: An embedding model exposing `encode` (SentenceTransformer-like).

    Returns:
        The array of embeddings produced by the model, one row per chunk.
    """
    # show_progress_bar surfaces encoding progress in the server console.
    return model.encode(chunks, show_progress_bar=True)

def create_faiss_index(embeddings):
    """Build a flat (exact, L2-distance) FAISS index over the embeddings.

    Args:
        embeddings: 2-D numpy array, one embedding vector per row.

    Returns:
        A populated faiss.IndexFlatL2 ready for similarity search.
    """
    # FAISS requires float32 input; cast once and reuse for add().
    vectors = embeddings.astype('float32')
    index = faiss.IndexFlatL2(vectors.shape[1])
    index.add(vectors)
    return index

def search_similar_chunks(query, model, index, chunks, k=3):
    """Retrieve the k chunks whose embeddings are nearest to the query.

    Args:
        query: The user's question as a string.
        model: Embedding model exposing `encode` (same one used at indexing).
        index: FAISS index built over the chunk embeddings.
        chunks: The chunk texts, in the order they were added to the index.
        k: Number of nearest chunks to return.

    Returns:
        The k chunk strings closest to the query in embedding space.
    """
    query_vec = model.encode([query]).astype('float32')
    # search() returns (distances, ids); only the ids are needed here.
    _, neighbor_ids = index.search(query_vec, k)
    return [chunks[idx] for idx in neighbor_ids[0]]

def generate_answer(question, context, qa_model):
    """Generate an answer from retrieved context using the QA pipeline.

    Args:
        question: The user's question.
        context: Retrieved chunk text to ground the answer in.
        qa_model: A Hugging Face text2text-generation pipeline (FLAN-T5).

    Returns:
        The generated answer string, with any echoed "Answer:" prefix removed.
    """
    # Truncate the context so the prompt stays within the model's token limit.
    truncated = context[:2000]

    prompt = f"Context: {truncated}\n\nQuestion: {question}\n\nAnswer:"

    output = qa_model(prompt, max_length=200, min_length=20, do_sample=False)
    answer = output[0]['generated_text']

    # Some generations echo the prompt; keep only what follows "Answer:".
    if "Answer:" in answer:
        answer = answer.split("Answer:")[-1].strip()

    return answer

# Main user interface: page header shown on every rerun.
st.title("πŸ“š PDF-Based RAG Chatbot")
st.markdown("Upload two PDF documents and ask questions about their content!")
st.markdown("**100% Free** - Uses open-source models from Hugging Face")

# Sidebar: upload widgets, the one-shot processing pipeline, and a tech note.
with st.sidebar:
    st.header("πŸ“„ Upload PDFs")
    # Distinct widget keys so Streamlit tracks the two uploaders separately.
    pdf1 = st.file_uploader("Upload PDF 1", type=['pdf'], key="pdf1")
    pdf2 = st.file_uploader("Upload PDF 2", type=['pdf'], key="pdf2")
    
    st.markdown("---")
    
    # Pipeline: extract -> chunk -> embed -> index -> load QA model.
    # Results are stored in st.session_state so they survive reruns.
    if st.button("πŸ”„ Process PDFs", type="primary"):
        if not pdf1 or not pdf2:
            st.error("Please upload both PDF files!")
        else:
            with st.spinner("Processing PDFs... This may take a minute on first run."):
                try:
                    # Extract text from both PDFs and join them into one corpus.
                    st.info("πŸ“– Reading PDFs...")
                    text1 = extract_text_from_pdf(pdf1)
                    text2 = extract_text_from_pdf(pdf2)
                    combined_text = text1 + "\n\n" + text2
                    
                    # Split the corpus into overlapping chunks for retrieval.
                    st.info("βœ‚οΈ Splitting text into chunks...")
                    chunks = split_text_into_chunks(combined_text)
                    st.session_state.chunks = chunks
                    
                    # Load embedding model lazily; cached in session state so
                    # it is loaded at most once per browser session.
                    if st.session_state.embeddings_model is None:
                        st.info("πŸ”§ Loading embedding model...")
                        st.session_state.embeddings_model = SentenceTransformer('all-MiniLM-L6-v2')
                    
                    # Embed every chunk with the sentence-transformer.
                    st.info("πŸ” Creating embeddings...")
                    embeddings = create_embeddings(chunks, st.session_state.embeddings_model)
                    
                    # Build the FAISS vector index over the chunk embeddings.
                    st.info("πŸ“Š Building search index...")
                    st.session_state.index = create_faiss_index(embeddings)
                    
                    # Load the FLAN-T5 answer-generation pipeline, also cached
                    # in session state to avoid reloading on reruns.
                    if st.session_state.qa_model is None:
                        st.info("πŸ€– Loading question-answering model...")
                        st.session_state.qa_model = pipeline(
                            "text2text-generation",
                            model="google/flan-t5-base"
                        )
                    
                    # Mark the session ready; the main area keys off this flag.
                    st.session_state.processed = True
                    st.success(f"βœ… Successfully processed {len(chunks)} chunks from both PDFs!")
                    
                except Exception as e:
                    # Broad catch: surface any pipeline failure in the UI
                    # rather than crashing the app.
                    st.error(f"Error: {str(e)}")
    
    # Status summary, shown once processing has completed.
    if st.session_state.processed:
        st.success("βœ… PDFs are ready!")
        st.info(f"πŸ“¦ Total chunks: {len(st.session_state.chunks)}")
    
    st.markdown("---")
    st.markdown("""
    ### πŸ› οΈ Tech Stack:
    - **Streamlit**: UI
    - **PyPDF2**: PDF reading
    - **Sentence Transformers**: Embeddings
    - **FAISS**: Vector search
    - **google/flan-t5-base**: Answer generation
    
    All models run locally - no API keys needed!
    """)

# Main content area: question/answer flow once PDFs are processed,
# otherwise a getting-started guide.
if st.session_state.processed:
    st.markdown("### πŸ’¬ Ask Questions")
    
    question = st.text_input(
        "Enter your question:",
        placeholder="What are the main topics in these documents?"
    )
    
    col1, col2 = st.columns([1, 4])
    with col1:
        ask_button = st.button("πŸ” Get Answer", type="primary")
    
    if ask_button:
        if not question:
            st.warning("Please enter a question!")
        else:
            with st.spinner("Searching documents and generating answer..."):
                try:
                    # Retrieve the top-k chunks nearest to the question.
                    relevant_chunks = search_similar_chunks(
                        question,
                        st.session_state.embeddings_model,
                        st.session_state.index,
                        st.session_state.chunks,
                        k=3
                    )
                    
                    # Join the retrieved chunks into one context block.
                    context = "\n\n".join(relevant_chunks)
                    
                    # Generate the grounded answer with FLAN-T5.
                    answer = generate_answer(question, context, st.session_state.qa_model)
                    
                    # Display the answer.
                    st.markdown("### πŸ“ Answer:")
                    st.success(answer)
                    
                    # Show the source chunks for transparency (truncated to
                    # 400 characters each for readability).
                    with st.expander("πŸ“„ View source text chunks"):
                        for i, chunk in enumerate(relevant_chunks, 1):
                            st.markdown(f"**Chunk {i}:**")
                            st.text((chunk[:400] + "...") if len(chunk) > 400 else chunk)
                            if i < len(relevant_chunks):
                                st.markdown("---")
                    
                except Exception as e:
                    # Surface retrieval/generation failures in the UI.
                    st.error(f"Error: {str(e)}")
else:
    st.info("πŸ‘ˆ Please upload two PDFs and click 'Process PDFs' to get started!")
    
    st.markdown("""
    ### πŸ“– How to Use:
    
    1. **Upload PDFs**: Upload two PDF documents in the sidebar
    2. **Process**: Click "Process PDFs" button (takes ~30 seconds the first time while the models load)
    3. **Ask Questions**: Type your question and click "Get Answer"
    4. **View Sources**: Expand to see which text chunks were used
    
    ### πŸ’‘ Example Questions:
    - What are the main topics in these documents?
    - Summarize the key findings
    - What does the document say about [specific topic]?
    - List the important points mentioned
    
    ### ✨ Features:
    - βœ… Processes two PDF documents together
    - βœ… Local FAISS search for retrieval of similar chunks
    - βœ… Open source - Uses Hugging Face models
    - βœ… Fast search - FAISS vector similarity
    """)

# Footer
st.markdown("---")
st.markdown("Built for the Algorizz interview round using Streamlit, Sentence Transformers, FAISS, and the FLAN-T5 model")