Itanutiwari527 committed on
Commit
876b710
·
verified ·
1 Parent(s): 36ab61b

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +403 -0
  2. requirements.txt +11 -3
app.py ADDED
@@ -0,0 +1,403 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import PyPDF2
3
+ import io
4
+ from sentence_transformers import SentenceTransformer
5
+ import faiss
6
+ import numpy as np
7
+ from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
8
+ import torch
9
+ import pickle
10
+ import os
11
+ import re
12
+ from typing import List, Tuple
13
+ import warnings
14
+ warnings.filterwarnings("ignore")
15
+
16
# Page config: browser-tab title, tab icon and a full-width layout.
st.set_page_config(
    page_title="RAG PDF Chat Application",
    page_icon="📚",  # books emoji; the mis-encoded byte sequence is not a valid icon
    layout="wide",
)
22
+
23
class RAGSystem:
    """Retrieval-augmented question answering over the text of one PDF.

    Stages: extract text -> split into overlapping chunks -> embed the
    chunks -> index them with FAISS -> retrieve the closest chunks for a
    query -> ask the LLM to answer from those chunks.

    Attributes:
        embedding_model: SentenceTransformer used for chunks and queries;
            loaded lazily on first use.
        llm_pipeline: Hugging Face generation pipeline; loaded lazily.
        index: FAISS index over the chunk embeddings, or None until the
            caller assigns one.
        chunks: Text chunks; row ``i`` of ``index`` must correspond to
            ``chunks[i]``.
        embeddings: Chunk embeddings as assigned by the caller.
    """

    def __init__(self):
        self.embedding_model = None
        self.llm_pipeline = None
        self.index = None
        self.chunks = []
        self.embeddings = None

    @st.cache_resource
    def load_embedding_model(_self):
        """Load the sentence-transformer used for chunk and query embeddings.

        The parameter is spelled ``_self`` so Streamlit's resource cache
        does not try to hash the instance.

        Returns:
            The loaded ``SentenceTransformer``, or None if loading failed
            (the error is shown in the UI).
        """
        # NOTE(review): a None result is cached like any other return value,
        # so a transient download failure may stick until restart - confirm.
        try:
            return SentenceTransformer('all-MiniLM-L6-v2')
        except Exception as e:
            st.error(f"Error loading embedding model: {str(e)}")
            return None

    @st.cache_resource
    def load_llm_model(_self, model_name: str = "google/flan-t5-base"):
        """Build the Hugging Face generation pipeline for ``model_name``.

        Flan-T5 checkpoints are served through ``text2text-generation``;
        any other name is treated as a causal LM and served through
        ``text-generation``. Alternatives to the default include
        ``google/flan-t5-large`` (larger, better quality) and
        ``microsoft/DialoGPT-small`` (conversational).

        Args:
            model_name: Hub id of the model to load.

        Returns:
            The pipeline, or None if loading failed (the error is shown
            in the UI).
        """
        device = 0 if torch.cuda.is_available() else -1
        try:
            # Case-insensitive, matching the check in generate_answer.
            if "flan-t5" in model_name.lower():
                return pipeline(
                    "text2text-generation",
                    model=model_name,
                    max_length=512,
                    temperature=0.7,
                    do_sample=True,
                    device=device,
                )

            tokenizer = AutoTokenizer.from_pretrained(model_name)
            if tokenizer.pad_token is None:
                # Reuse EOS as the padding token when the tokenizer has none.
                tokenizer.pad_token = tokenizer.eos_token

            return pipeline(
                "text-generation",
                model=model_name,
                tokenizer=tokenizer,
                max_length=512,
                temperature=0.7,
                do_sample=True,
                device=device,
            )
        except Exception as e:
            st.error(f"Error loading LLM: {str(e)}")
            return None

    def extract_text_from_pdf(self, pdf_file) -> str:
        """Return the text of every page of ``pdf_file``, one page per line group.

        Args:
            pdf_file: Anything ``PyPDF2.PdfReader`` accepts (the app passes
                the Streamlit upload object).

        Returns:
            The concatenated page texts, each followed by a newline, or
            ``""`` if the PDF could not be read (the error is shown in the UI).
        """
        try:
            pdf_reader = PyPDF2.PdfReader(pdf_file)
            # `or ""` guards against a page that yields no text at all, so a
            # single empty page cannot abort the whole document.
            return "".join(
                (page.extract_text() or "") + "\n" for page in pdf_reader.pages
            )
        except Exception as e:
            st.error(f"Error extracting text from PDF: {str(e)}")
            return ""

    def chunk_text(self, text: str, chunk_size: int = 500, overlap: int = 50) -> List[str]:
        """Split ``text`` into sentence-aligned, overlapping chunks.

        Whitespace runs are collapsed to single spaces first. Sentences are
        accumulated until adding the next one would push the chunk past
        ``chunk_size``; the chunk is then emitted and the next chunk starts
        with the last ``overlap`` words of the previous one.

        Args:
            text: Raw document text.
            chunk_size: Soft limit in characters; a single sentence longer
                than this is kept whole.
            overlap: Number of trailing words carried into the next chunk;
                ``0`` disables overlap.

        Returns:
            The chunks in document order; empty when ``text`` is blank.
        """
        text = re.sub(r'\s+', ' ', text.strip())
        if not text:
            return []

        # Split only at whitespace that follows sentence-ending punctuation:
        # terminators stay attached to their sentence and tokens such as
        # "3.14" or "e.g.x" are not torn apart.
        sentences = re.split(r'(?<=[.!?])\s+', text)

        chunks = []
        current_chunk = ""
        for sentence in sentences:
            sentence = sentence.strip()
            if not sentence:
                continue

            if current_chunk and len(current_chunk) + len(sentence) > chunk_size:
                chunks.append(current_chunk)
                words = current_chunk.split()
                # words[-0:] would be the whole list, so treat 0 explicitly.
                carried = words[-overlap:] if overlap > 0 else []
                current_chunk = ' '.join(carried + [sentence])
            elif current_chunk:
                current_chunk = f"{current_chunk} {sentence}"
            else:
                current_chunk = sentence

        if current_chunk:
            chunks.append(current_chunk)
        return chunks

    def create_embeddings(self, chunks: List[str]) -> np.ndarray:
        """Embed ``chunks`` with the sentence-transformer, loading it if needed.

        Args:
            chunks: Text chunks to embed.

        Returns:
            One embedding row per chunk, or None when there is nothing to
            embed, the model is unavailable, or encoding failed.
        """
        if not chunks:
            st.error("No text chunks to embed")
            return None

        if self.embedding_model is None:
            self.embedding_model = self.load_embedding_model()
        if self.embedding_model is None:
            return None

        try:
            return self.embedding_model.encode(chunks, show_progress_bar=True)
        except Exception as e:
            st.error(f"Error creating embeddings: {str(e)}")
            return None

    def create_vector_store(self, embeddings: np.ndarray):
        """Build a FAISS inner-product index over L2-normalised ``embeddings``.

        With unit-length vectors the inner product equals cosine similarity.

        Args:
            embeddings: 2-D array with one row per chunk.

        Returns:
            The populated index, or None on failure (the error is shown in
            the UI). ``embeddings`` itself is left unmodified.
        """
        try:
            # Work on a float32, C-contiguous copy: normalize_L2 operates in
            # place, and the caller's array should not change under it.
            vectors = np.array(embeddings, dtype='float32', order='C')
            index = faiss.IndexFlatIP(vectors.shape[1])
            faiss.normalize_L2(vectors)
            index.add(vectors)
            return index
        except Exception as e:
            st.error(f"Error creating vector store: {str(e)}")
            return None

    def search_similar_chunks(self, query: str, k: int = 3) -> List[Tuple[str, float]]:
        """Return up to ``k`` chunks most similar to ``query``.

        Args:
            query: The user's question.
            k: Maximum number of chunks to return.

        Returns:
            ``(chunk, score)`` pairs in the order FAISS returns them; empty
            when no index/model is available or the search failed.
        """
        if self.embedding_model is None or self.index is None:
            return []

        try:
            query_embedding = np.array(
                self.embedding_model.encode([query]), dtype='float32', order='C'
            )
            faiss.normalize_L2(query_embedding)

            # Never ask for more neighbours than the index holds.
            k = min(k, self.index.ntotal)
            if k <= 0:
                return []

            scores, indices = self.index.search(query_embedding, k)

            # FAISS pads missing results with index -1; without the lower
            # bound that would silently select self.chunks[-1].
            return [
                (self.chunks[int(idx)], float(score))
                for idx, score in zip(indices[0], scores[0])
                if 0 <= idx < len(self.chunks)
            ]
        except Exception as e:
            st.error(f"Error searching chunks: {str(e)}")
            return []

    def generate_answer(self, query: str, context_chunks: List[str]) -> str:
        """Answer ``query`` with the LLM, grounded in ``context_chunks``.

        Args:
            query: The user's question.
            context_chunks: Retrieved chunks, best first; only the first two
                are placed in the prompt to limit its length.

        Returns:
            The generated answer, or a fixed fallback sentence when the
            model is unavailable, produces nothing, or raises.
        """
        if self.llm_pipeline is None:
            self.llm_pipeline = self.load_llm_model()
        if self.llm_pipeline is None:
            return "Sorry, LLM model is not available."

        try:
            context = "\n".join(context_chunks[:2])
            model_name = getattr(self.llm_pipeline.model, 'name_or_path', 'unknown')

            if "flan-t5" in model_name.lower():
                # Flan-T5 (text2text-generation): output is the answer only.
                prompt = f"Answer the question based on the context.\n\nContext: {context}\n\nQuestion: {query}\n\nAnswer:"
                response = self.llm_pipeline(
                    prompt,
                    max_length=200,
                    num_return_sequences=1,
                    temperature=0.7,
                    do_sample=True,
                )
            else:
                # GPT-style models (text-generation).
                prompt = (
                    "Based on the following context, answer the question:\n\n"
                    f"Context: {context}\n\n"
                    f"Question: {query}\n\n"
                    "Answer:"
                )
                response = self.llm_pipeline(
                    prompt,
                    # Budget the *generated* tokens; a word count of the
                    # prompt is not a token count and could leave no room.
                    max_new_tokens=100,
                    # Return only the continuation instead of slicing the
                    # prompt off by character offset.
                    return_full_text=False,
                    num_return_sequences=1,
                    temperature=0.7,
                    do_sample=True,
                    pad_token_id=self.llm_pipeline.tokenizer.eos_token_id,
                )

            answer = response[0]['generated_text'].strip()
            return answer if answer else "I couldn't find a specific answer in the provided context."

        except Exception as e:
            st.error(f"Error generating answer: {str(e)}")
            return "Sorry, I encountered an error while generating the answer."
243
+
244
+ # Initialize RAG system
245
def get_rag_system():
    """Return the RAGSystem that belongs to the current browser session.

    The instance carries per-document state (chunks, embeddings, FAISS
    index), so it lives in ``st.session_state`` rather than behind
    ``st.cache_resource``: a resource cache hands the same object to every
    session, which would let one user's upload replace the document another
    user is chatting with. The heavy models are still shared, because the
    loader methods on RAGSystem carry their own ``st.cache_resource``.
    """
    if 'rag_system' not in st.session_state:
        st.session_state['rag_system'] = RAGSystem()
    return st.session_state['rag_system']
248
+
249
+ # Main app
250
def _process_pdf(rag, uploaded_file) -> bool:
    """Run extract -> chunk -> embed -> index for ``uploaded_file``.

    Progress and failures are reported through Streamlit status widgets.
    The results are assigned to ``rag`` only after every stage succeeded,
    so a failed run leaves any previously processed document intact and
    ``rag.chunks`` always matches ``rag.index``.

    Returns:
        True when the document is ready for retrieval, False otherwise.
    """
    st.info("Extracting text from PDF...")
    text = rag.extract_text_from_pdf(uploaded_file)
    # Whitespace-only text (e.g. a scanned PDF) is as useless as no text.
    if not text.strip():
        st.error("Failed to extract text from PDF")
        return False
    st.success(f"Extracted {len(text)} characters")

    st.info("Splitting text into chunks...")
    chunks = rag.chunk_text(text)
    st.success(f"Created {len(chunks)} chunks")

    st.info("Generating embeddings...")
    embeddings = rag.create_embeddings(chunks) if chunks else None
    if embeddings is None:
        st.error("Failed to generate embeddings")
        return False
    st.success(f"Generated embeddings: {embeddings.shape}")

    st.info("Creating vector store...")
    index = rag.create_vector_store(embeddings)
    if index is None:
        st.error("Failed to create vector store")
        return False

    rag.chunks, rag.embeddings, rag.index = chunks, embeddings, index
    st.success("PDF processed successfully!")
    return True


def _render_sidebar(rag) -> None:
    """Draw the upload/processing controls and the model summary."""
    with st.sidebar:
        st.header("Document Processing")

        uploaded_file = st.file_uploader(
            "Upload a PDF file",
            type=['pdf'],
            help="Upload a PDF document to create embeddings and chat with it"
        )

        if uploaded_file is not None:
            st.success(f"Uploaded: {uploaded_file.name}")

            if st.button("Process PDF", type="primary"):
                with st.spinner("Processing PDF... This may take a few minutes"):
                    if _process_pdf(rag, uploaded_file):
                        st.session_state['pdf_processed'] = True

        # Test the flag's value, not merely its presence.
        if st.session_state.get('pdf_processed'):
            st.success("PDF Ready for Chat!")

        st.header("Model Information")
        st.info("""
        **Embedding Model**: all-MiniLM-L6-v2 (384 dim)
        **LLM Model**: google/flan-t5-base (250M params)
        **Vector Store**: FAISS with cosine similarity

        **Alternative Models Available:**
        - google/flan-t5-large (better quality)
        - microsoft/DialoGPT-small (conversational)
        - facebook/bart-base (summarization focus)
        """)


def _render_chat(rag) -> None:
    """Replay the chat history and answer a newly submitted question."""
    st.header("Chat with your PDF")

    if 'messages' not in st.session_state:
        st.session_state.messages = []

    for message in st.session_state.messages:
        with st.chat_message(message["role"]):
            st.markdown(message["content"])
            if "sources" in message:
                with st.expander("View Sources"):
                    for i, source in enumerate(message["sources"], 1):
                        st.markdown(f"**Source {i}:**")
                        st.text(source)

    prompt = st.chat_input("Ask a question about your PDF...")
    if not prompt:
        return

    st.session_state.messages.append({"role": "user", "content": prompt})
    with st.chat_message("user"):
        st.markdown(prompt)

    with st.chat_message("assistant"):
        with st.spinner("Searching and generating answer..."):
            similar_chunks = rag.search_similar_chunks(prompt, k=3)

            if not similar_chunks:
                error_msg = "Sorry, I couldn't find relevant information to answer your question."
                st.markdown(error_msg)
                st.session_state.messages.append({"role": "assistant", "content": error_msg})
                return

            context_chunks = [chunk for chunk, _score in similar_chunks]
            answer = rag.generate_answer(prompt, context_chunks)
            st.markdown(answer)

            with st.expander("View Sources"):
                for i, (chunk, score) in enumerate(similar_chunks, 1):
                    st.markdown(f"**Source {i} (Similarity: {score:.3f}):**")
                    st.text(chunk[:500] + "..." if len(chunk) > 500 else chunk)

            # Keep the sources with the message so they can be replayed
            # when the history is redrawn on the next rerun.
            st.session_state.messages.append({
                "role": "assistant",
                "content": answer,
                "sources": context_chunks
            })


def _render_getting_started() -> None:
    """Show usage instructions while no PDF has been processed."""
    st.header(" ****Getting Started****")
    st.markdown("""
    ### Welcome to the RAG PDF Chat Application!

    **Steps to use:**
    1. 📄 Upload a PDF file using the sidebar
    2. 🔄 Click "Process PDF" to create embeddings
    3. 💬 Start chatting with your document!

    **Features:**
    - 🧠 AI-powered document understanding
    - 🔍 Semantic search through your PDF
    - 📚 Source citations for transparency
    - ⚡ Fast vector-based retrieval

    **Note:** First time loading may take a few minutes to download models.
    """)


def main():
    """Streamlit entry point: sidebar for document processing, main pane for chat."""
    st.title("RAG PDF Chat Application")
    st.markdown("Upload a PDF and chat with its contents using AI!")

    rag = get_rag_system()
    _render_sidebar(rag)

    if st.session_state.get('pdf_processed'):
        _render_chat(rag)
    else:
        _render_getting_started()


if __name__ == "__main__":
    main()
requirements.txt CHANGED
@@ -1,3 +1,11 @@
1
- altair
2
- pandas
3
- streamlit
 
 
 
 
 
 
 
 
 
1
+ streamlit>=1.28.0
2
+ PyPDF2>=3.0.1
3
+ sentence-transformers>=2.2.2
4
+ faiss-cpu>=1.7.4
5
+ transformers>=4.30.0
6
+ torch>=2.0.0
7
+ numpy>=1.24.0
8
+ scikit-learn>=1.3.0
9
+ pandas>=2.0.0
10
+ accelerate>=0.20.0
11
+ sentencepiece>=0.1.99