File size: 8,727 Bytes
4556f47
ae90dd6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4556f47
ae90dd6
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
import streamlit as st
from PyPDF2 import PdfReader
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
from transformers import pipeline

# Page config
st.set_page_config(
    page_title="PDF RAG Chatbot",
    page_icon="πŸ“š",
    layout="wide"
)

# Initialize session state.
# Streamlit reruns the whole script on every interaction, so each key is
# created only if missing; a single defaults table replaces five copy-pasted
# if-blocks and keeps the defaults in one place.
_SESSION_DEFAULTS = {
    'processed': False,        # True once both PDFs have been indexed
    'chunks': [],              # text chunks extracted from the PDFs
    'index': None,             # FAISS index over the chunk embeddings
    'embeddings_model': None,  # cached SentenceTransformer instance
    'qa_model': None,          # cached text2text-generation pipeline
}
for _key, _default in _SESSION_DEFAULTS.items():
    if _key not in st.session_state:
        st.session_state[_key] = _default

def extract_text_from_pdf(pdf_file):
    """Extract and concatenate the text of every page of a PDF.

    Args:
        pdf_file: A file path or binary file-like object (e.g. a Streamlit
            UploadedFile) readable by PyPDF2's PdfReader.

    Returns:
        The text of all pages concatenated into one string. Pages with no
        extractable text contribute an empty string instead of crashing.
    """
    pdf_reader = PdfReader(pdf_file)
    # extract_text() can return None for image-only/scanned pages; the
    # original `text += page.extract_text()` would raise TypeError there.
    return "".join(page.extract_text() or "" for page in pdf_reader.pages)

def split_text_into_chunks(text, chunk_size=1000, overlap=200):
    """Split text into fixed-size character chunks with overlap.

    Consecutive chunks share `overlap` characters so sentences cut at a
    chunk boundary still appear whole in the neighbouring chunk.

    Args:
        text: The full document text.
        chunk_size: Maximum characters per chunk (must be positive).
        overlap: Characters shared between consecutive chunks
            (must satisfy 0 <= overlap < chunk_size).

    Returns:
        A list of non-blank chunk strings (whitespace-only chunks dropped).

    Raises:
        ValueError: If the parameters would make the window step
            non-positive — the original while-loop looped forever in
            that case.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be positive")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(text), step):
        chunk = text[start:start + chunk_size]
        if chunk.strip():  # Only keep chunks with visible content
            chunks.append(chunk)
    return chunks

def create_embeddings(chunks, model):
    """Encode text chunks into dense vectors with the given embedding model.

    Args:
        chunks: List of text strings to embed.
        model: An embedding model exposing `encode` (SentenceTransformer-like).

    Returns:
        The array of embeddings produced by the model, one row per chunk.
    """
    # show_progress_bar surfaces encoding progress in the server console.
    return model.encode(chunks, show_progress_bar=True)

def create_faiss_index(embeddings):
    """Build a flat (exact, L2-distance) FAISS index over the embeddings.

    Args:
        embeddings: 2-D numpy array, one embedding vector per row.

    Returns:
        A populated faiss.IndexFlatL2 ready for similarity search.
    """
    # FAISS requires float32 input; cast once and reuse for add().
    vectors = embeddings.astype('float32')
    index = faiss.IndexFlatL2(vectors.shape[1])
    index.add(vectors)
    return index

def search_similar_chunks(query, model, index, chunks, k=3):
    """Retrieve the k chunks whose embeddings are nearest to the query.

    Args:
        query: The user's question as a string.
        model: Embedding model exposing `encode` (same one used at indexing).
        index: FAISS index built over the chunk embeddings.
        chunks: The chunk texts, in the order they were added to the index.
        k: Number of nearest chunks to return.

    Returns:
        The k chunk strings closest to the query in embedding space.
    """
    query_vec = model.encode([query]).astype('float32')
    # search() returns (distances, ids); only the ids are needed here.
    _, neighbor_ids = index.search(query_vec, k)
    return [chunks[idx] for idx in neighbor_ids[0]]

def generate_answer(question, context, qa_model):
    """Generate an answer from retrieved context using the QA pipeline.

    Args:
        question: The user's question.
        context: Retrieved chunk text to ground the answer in.
        qa_model: A Hugging Face text2text-generation pipeline (FLAN-T5).

    Returns:
        The generated answer string, with any echoed "Answer:" prefix removed.
    """
    # Truncate the context so the prompt stays within the model's token limit.
    truncated = context[:2000]

    prompt = f"Context: {truncated}\n\nQuestion: {question}\n\nAnswer:"

    output = qa_model(prompt, max_length=200, min_length=20, do_sample=False)
    answer = output[0]['generated_text']

    # Some generations echo the prompt; keep only what follows "Answer:".
    if "Answer:" in answer:
        answer = answer.split("Answer:")[-1].strip()

    return answer

# Main user interface: page header shown on every rerun.
st.title("πŸ“š PDF-Based RAG Chatbot")
st.markdown("Upload two PDF documents and ask questions about their content!")
st.markdown("**100% Free** - Uses open-source models from Hugging Face")

# Sidebar: upload widgets, the one-shot processing pipeline, and a tech note.
with st.sidebar:
    st.header("πŸ“„ Upload PDFs")
    # Distinct widget keys so Streamlit tracks the two uploaders separately.
    pdf1 = st.file_uploader("Upload PDF 1", type=['pdf'], key="pdf1")
    pdf2 = st.file_uploader("Upload PDF 2", type=['pdf'], key="pdf2")
    
    st.markdown("---")
    
    # Pipeline: extract -> chunk -> embed -> index -> load QA model.
    # Results are stored in st.session_state so they survive reruns.
    if st.button("πŸ”„ Process PDFs", type="primary"):
        if not pdf1 or not pdf2:
            st.error("Please upload both PDF files!")
        else:
            with st.spinner("Processing PDFs... This may take a minute on first run."):
                try:
                    # Extract text from both PDFs and join them into one corpus.
                    st.info("πŸ“– Reading PDFs...")
                    text1 = extract_text_from_pdf(pdf1)
                    text2 = extract_text_from_pdf(pdf2)
                    combined_text = text1 + "\n\n" + text2
                    
                    # Split the corpus into overlapping chunks for retrieval.
                    st.info("βœ‚οΈ Splitting text into chunks...")
                    chunks = split_text_into_chunks(combined_text)
                    st.session_state.chunks = chunks
                    
                    # Load embedding model lazily; cached in session state so
                    # it is loaded at most once per browser session.
                    if st.session_state.embeddings_model is None:
                        st.info("πŸ”§ Loading embedding model...")
                        st.session_state.embeddings_model = SentenceTransformer('all-MiniLM-L6-v2')
                    
                    # Embed every chunk with the sentence-transformer.
                    st.info("πŸ” Creating embeddings...")
                    embeddings = create_embeddings(chunks, st.session_state.embeddings_model)
                    
                    # Build the FAISS vector index over the chunk embeddings.
                    st.info("πŸ“Š Building search index...")
                    st.session_state.index = create_faiss_index(embeddings)
                    
                    # Load the FLAN-T5 answer-generation pipeline, also cached
                    # in session state to avoid reloading on reruns.
                    if st.session_state.qa_model is None:
                        st.info("πŸ€– Loading question-answering model...")
                        st.session_state.qa_model = pipeline(
                            "text2text-generation",
                            model="google/flan-t5-base"
                        )
                    
                    # Mark the session ready; the main area keys off this flag.
                    st.session_state.processed = True
                    st.success(f"βœ… Successfully processed {len(chunks)} chunks from both PDFs!")
                    
                except Exception as e:
                    # Broad catch: surface any pipeline failure in the UI
                    # rather than crashing the app.
                    st.error(f"Error: {str(e)}")
    
    # Status summary, shown once processing has completed.
    if st.session_state.processed:
        st.success("βœ… PDFs are ready!")
        st.info(f"πŸ“¦ Total chunks: {len(st.session_state.chunks)}")
    
    st.markdown("---")
    st.markdown("""
    ### πŸ› οΈ Tech Stack:
    - **Streamlit**: UI
    - **PyPDF2**: PDF reading
    - **Sentence Transformers**: Embeddings
    - **FAISS**: Vector search
    - **google/flan-t5-base**: Answer generation
    
    All models run locally - no API keys needed!
    """)

# Main content area: question/answer flow once PDFs are processed,
# otherwise a getting-started guide.
if st.session_state.processed:
    st.markdown("### πŸ’¬ Ask Questions")
    
    question = st.text_input(
        "Enter your question:",
        placeholder="What are the main topics in these documents?"
    )
    
    col1, col2 = st.columns([1, 4])
    with col1:
        ask_button = st.button("πŸ” Get Answer", type="primary")
    
    if ask_button:
        if not question:
            st.warning("Please enter a question!")
        else:
            with st.spinner("Searching documents and generating answer..."):
                try:
                    # Retrieve the top-k chunks nearest to the question.
                    relevant_chunks = search_similar_chunks(
                        question,
                        st.session_state.embeddings_model,
                        st.session_state.index,
                        st.session_state.chunks,
                        k=3
                    )
                    
                    # Join the retrieved chunks into one context block.
                    context = "\n\n".join(relevant_chunks)
                    
                    # Generate the grounded answer with FLAN-T5.
                    answer = generate_answer(question, context, st.session_state.qa_model)
                    
                    # Display the answer.
                    st.markdown("### πŸ“ Answer:")
                    st.success(answer)
                    
                    # Show the source chunks for transparency (truncated to
                    # 400 characters each for readability).
                    with st.expander("πŸ“„ View source text chunks"):
                        for i, chunk in enumerate(relevant_chunks, 1):
                            st.markdown(f"**Chunk {i}:**")
                            st.text((chunk[:400] + "...") if len(chunk) > 400 else chunk)
                            if i < len(relevant_chunks):
                                st.markdown("---")
                    
                except Exception as e:
                    # Surface retrieval/generation failures in the UI.
                    st.error(f"Error: {str(e)}")
else:
    st.info("πŸ‘ˆ Please upload two PDFs and click 'Process PDFs' to get started!")
    
    st.markdown("""
    ### πŸ“– How to Use:
    
    1. **Upload PDFs**: Upload two PDF documents in the sidebar
    2. **Process**: Click "Process PDFs" button (takes ~30 seconds the first time while the models load)
    3. **Ask Questions**: Type your question and click "Get Answer"
    4. **View Sources**: Expand to see which text chunks were used
    
    ### πŸ’‘ Example Questions:
    - What are the main topics in these documents?
    - Summarize the key findings
    - What does the document say about [specific topic]?
    - List the important points mentioned
    
    ### ✨ Features:
    - βœ… Processes two PDF documents together
    - βœ… Local FAISS search for retrieval of similar chunks
    - βœ… Open source - Uses Hugging Face models
    - βœ… Fast search - FAISS vector similarity
    """)

# Footer
st.markdown("---")
st.markdown("Built for the Algorizz interview round using Streamlit, Sentence Transformers, FAISS, and the FLAN-T5 model")