Spaces:
Sleeping
Sleeping
File size: 8,727 Bytes
4556f47 ae90dd6 4556f47 ae90dd6 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 |
import streamlit as st
from PyPDF2 import PdfReader
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
from transformers import pipeline
# --- Page configuration --------------------------------------------------
# NOTE: st.set_page_config must be the first Streamlit call in the script.
st.set_page_config(
    page_title="PDF RAG Chatbot",
    page_icon="π",
    layout="wide",
)
# Initialize session state
if 'processed' not in st.session_state:
st.session_state.processed = False
if 'chunks' not in st.session_state:
st.session_state.chunks = []
if 'index' not in st.session_state:
st.session_state.index = None
if 'embeddings_model' not in st.session_state:
st.session_state.embeddings_model = None
if 'qa_model' not in st.session_state:
st.session_state.qa_model = None
def extract_text_from_pdf(pdf_file):
    """Extract the plain text of every page of an uploaded PDF.

    Args:
        pdf_file: A file-like object (e.g. a Streamlit ``UploadedFile``)
            readable by PyPDF2's ``PdfReader``.

    Returns:
        str: The concatenated text of all pages, in page order. Pages
        with no extractable text (scanned images, blank pages)
        contribute the empty string.
    """
    pdf_reader = PdfReader(pdf_file)
    page_texts = []
    for page in pdf_reader.pages:
        # PyPDF2 can return None for pages without extractable text;
        # the original `text += page.extract_text()` raised TypeError
        # on such pages. Coerce None to "".
        page_texts.append(page.extract_text() or "")
    # join() avoids quadratic string concatenation on large documents.
    return "".join(page_texts)
def split_text_into_chunks(text, chunk_size=1000, overlap=200):
    """Split *text* into fixed-size character chunks with overlap.

    Consecutive chunks share ``overlap`` characters so that a sentence
    cut at one chunk boundary still appears intact in the next chunk.

    Args:
        text (str): The text to split.
        chunk_size (int): Maximum characters per chunk; must be > 0.
        overlap (int): Characters shared between consecutive chunks;
            must be strictly smaller than ``chunk_size``.

    Returns:
        list[str]: Non-blank chunks in document order.

    Raises:
        ValueError: If ``chunk_size`` <= 0 or ``overlap >= chunk_size``.
            (The original code looped forever in that case, because the
            start position advanced by ``chunk_size - overlap`` <= 0.)
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be positive")
    step = chunk_size - overlap
    if step <= 0:
        raise ValueError("overlap must be smaller than chunk_size")
    chunks = []
    for start in range(0, len(text), step):
        chunk = text[start:start + chunk_size]
        if chunk.strip():  # skip whitespace-only chunks
            chunks.append(chunk)
    return chunks
def create_embeddings(chunks, model):
    """Encode every text chunk into a dense embedding vector.

    Args:
        chunks: Sequence of text chunks to embed.
        model: A SentenceTransformer-style object exposing ``encode``.

    Returns:
        The array of embeddings produced by ``model.encode``, one row
        per chunk.
    """
    # show_progress_bar gives console feedback during the long encode step.
    return model.encode(chunks, show_progress_bar=True)
def create_faiss_index(embeddings):
    """Build an exact (brute-force) L2-distance FAISS index.

    Args:
        embeddings: 2-D array of shape (n_chunks, dim), one embedding
            per chunk.

    Returns:
        faiss.IndexFlatL2: An index containing every row of
        *embeddings*, ready for nearest-neighbour search.
    """
    vectors = embeddings.astype('float32')  # FAISS only accepts float32
    index = faiss.IndexFlatL2(vectors.shape[1])
    index.add(vectors)
    return index
def search_similar_chunks(query, model, index, chunks, k=3):
    """Return the *k* chunks whose embeddings are nearest the query.

    Args:
        query (str): Natural-language question.
        model: Embedding model exposing ``encode``.
        index: FAISS index built over the chunk embeddings.
        chunks (list[str]): Chunk texts, row-aligned with the index.
        k (int): Number of neighbours to retrieve.

    Returns:
        list[str]: The matching chunk texts, nearest first.
    """
    query_vec = model.encode([query]).astype('float32')
    _distances, neighbour_ids = index.search(query_vec, k)
    # index.search returns one row of ids per query; we sent exactly one.
    return [chunks[idx] for idx in neighbour_ids[0]]
def generate_answer(question, context, qa_model):
    """Answer *question* from *context* with a text2text pipeline.

    Args:
        question (str): The user's question.
        context (str): Retrieved supporting text; truncated to 2000
            characters to stay within the model's input limit.
        qa_model: A Hugging Face ``text2text-generation`` pipeline
            (e.g. google/flan-t5-base).

    Returns:
        str: The generated answer, with any leading "Answer:" echo of
        the prompt stripped off.
    """
    # Slicing is a no-op for short contexts, so unconditional is safe.
    context = context[:2000]
    prompt = f"Context: {context}\n\nQuestion: {question}\n\nAnswer:"
    generated = qa_model(prompt, max_length=200, min_length=20, do_sample=False)
    answer = generated[0]['generated_text']
    # Some generations echo part of the prompt; keep only the text
    # after the final "Answer:" marker.
    if "Answer:" in answer:
        answer = answer.split("Answer:")[-1].strip()
    return answer
# --- Main page header: app title and a short pitch shown above the chat ---
st.title("π PDF-Based RAG Chatbot")
st.markdown("Upload two PDF documents and ask questions about their content!")
st.markdown("**100% Free** - Uses open-source models from Hugging Face")
# --- Sidebar: upload widgets and the PDF-processing pipeline -------------
with st.sidebar:
    st.header("π Upload PDFs")
    pdf1 = st.file_uploader("Upload PDF 1", type=['pdf'], key="pdf1")
    pdf2 = st.file_uploader("Upload PDF 2", type=['pdf'], key="pdf2")
    st.markdown("---")
    if st.button("π Process PDFs", type="primary"):
        if not pdf1 or not pdf2:
            st.error("Please upload both PDF files!")
        else:
            with st.spinner("Processing PDFs... This may take a minute on first run."):
                try:
                    # 1) Extract the raw text of both documents.
                    st.info("π Reading PDFs...")
                    text1 = extract_text_from_pdf(pdf1)
                    text2 = extract_text_from_pdf(pdf2)
                    combined_text = text1 + "\n\n" + text2
                    # 2) Chunk the combined text for retrieval.
                    st.info("βοΈ Splitting text into chunks...")
                    chunks = split_text_into_chunks(combined_text)
                    st.session_state.chunks = chunks
                    # 3) Lazily load the embedding model (kept in
                    #    session state so reprocessing is fast).
                    if st.session_state.embeddings_model is None:
                        st.info("π§ Loading embedding model...")
                        st.session_state.embeddings_model = SentenceTransformer('all-MiniLM-L6-v2')
                    # 4) Embed every chunk.
                    st.info("π Creating embeddings...")
                    embeddings = create_embeddings(chunks, st.session_state.embeddings_model)
                    # 5) Index embeddings for nearest-neighbour search.
                    st.info("π Building search index...")
                    st.session_state.index = create_faiss_index(embeddings)
                    # 6) Lazily load the answer-generation model.
                    if st.session_state.qa_model is None:
                        st.info("π€ Loading question-answering model...")
                        st.session_state.qa_model = pipeline(
                            "text2text-generation",
                            model="google/flan-t5-base"
                        )
                    st.session_state.processed = True
                    st.success(f"β… Successfully processed {len(chunks)} chunks from both PDFs!")
                except Exception as e:
                    # Surface any pipeline failure in the sidebar rather
                    # than crashing the whole Streamlit script.
                    st.error(f"Error: {str(e)}")
    # Status panel persists across reruns once processing succeeded.
    if st.session_state.processed:
        st.success("β… PDFs are ready!")
        st.info(f"π¦ Total chunks: {len(st.session_state.chunks)}")
    st.markdown("---")
    st.markdown("""
    ### π οΈ Tech Stack:
    - **Streamlit**: UI
    - **PyPDF2**: PDF reading
    - **Sentence Transformers**: Embeddings
    - **FAISS**: Vector search
    - **google/flan-t5-base**: Answer generation

    All models run locally - no API keys needed!
    """)
# --- Main content area: Q&A once processed, onboarding help otherwise ----
if st.session_state.processed:
    st.markdown("### π¬ Ask Questions")
    question = st.text_input(
        "Enter your question:",
        placeholder="What are the main topics in these documents?"
    )
    col1, col2 = st.columns([1, 4])
    with col1:
        ask_button = st.button("π Get Answer", type="primary")
    if ask_button:
        if not question:
            st.warning("Please enter a question!")
        else:
            with st.spinner("Searching documents and generating answer..."):
                try:
                    # Retrieve the chunks most similar to the question.
                    relevant_chunks = search_similar_chunks(
                        question,
                        st.session_state.embeddings_model,
                        st.session_state.index,
                        st.session_state.chunks,
                        k=3
                    )
                    # Stitch them together as the generator's context.
                    context = "\n\n".join(relevant_chunks)
                    answer = generate_answer(question, context, st.session_state.qa_model)
                    st.markdown("### π Answer:")
                    st.success(answer)
                    # Let the user inspect the retrieved evidence.
                    with st.expander("π View source text chunks"):
                        for i, chunk in enumerate(relevant_chunks, 1):
                            st.markdown(f"**Chunk {i}:**")
                            # Preview long chunks at 400 chars.
                            st.text(chunk[:400] + "..." if len(chunk) > 400 else chunk)
                            if i < len(relevant_chunks):
                                st.markdown("---")
                except Exception as e:
                    st.error(f"Error: {str(e)}")
else:
    st.info("π Please upload two PDFs and click 'Process PDFs' to get started!")
    st.markdown("""
    ### π How to Use:
    1. **Upload PDFs**: Upload two PDF documents in the sidebar <- add as much as you want
    2. **Process**: Click "Process PDFs" button (takes ~30 seconds first time because it needs to do multiple process)
    3. **Ask Questions**: Type your question and click "Get Answer"
    4. **View Sources**: Expand to see which text chunks were used

    ### π‘ Example Questions:
    - What are the main topics in these documents?
    - Summarize the key findings
    - What does the document say about [specific topic]?
    - List the important points mentioned

    ### β¨ Features:
    - β… 2 document processing at a time concurently
    - β… FAISS local searching for retrival of similar chunks
    - β… Open source - Uses Hugging Face models
    - β… Fast search - FAISS vector similarity
    """)
# --- Footer: horizontal rule plus attribution line ------------------------
st.markdown("---")
st.markdown("Built for Algorizz for Interview round using Streamlit, Sentence Transformers, FAISS, and FLAN-T5 model")
|