Shreyas094 commited on
Commit
0b862cc
·
verified ·
1 Parent(s): 5f7b9cb

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +291 -76
app.py CHANGED
@@ -8,34 +8,145 @@ import os
8
  import logging
9
  import traceback
10
  from datetime import datetime
 
 
11
 
12
  # Configure logging
13
  logging.basicConfig(
14
  level=logging.DEBUG,
15
  format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
16
  handlers=[
17
- logging.FileHandler(f'rag_app_{datetime.now().strftime("%Y%m%d_%H%M%S")}.log'),
18
  logging.StreamHandler()
19
  ]
20
  )
21
  logger = logging.getLogger(__name__)
22
 
23
- class RAGApplication:
24
- def __init__(self, hf_api_key):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
  try:
26
  self.hf_api_key = hf_api_key
27
- self.vector_store = None
 
 
 
28
  logger.info("Initializing HuggingFace embeddings...")
29
  self.embeddings = HuggingFaceInferenceAPIEmbeddings(
30
  api_key=hf_api_key,
31
  model_name="sentence-transformers/all-MiniLM-L6-v2"
32
  )
 
33
  logger.info("Initializing HuggingFace client...")
34
  self.client = InferenceClient(api_key=hf_api_key)
35
  self.conversation_history = []
36
- logger.info("RAGApplication initialized successfully")
 
 
 
 
 
37
  except Exception as e:
38
- logger.error(f"Error initializing RAGApplication: {str(e)}")
39
  logger.error(f"Traceback: {traceback.format_exc()}")
40
  raise
41
 
@@ -46,11 +157,10 @@ class RAGApplication:
46
  4. Use concise language and avoid unnecessary elaboration
47
  5. Maintain continuity with previous conversation when relevant
48
 
49
- Remember:
50
- - Keep responses to three sentences maximum
51
- - Focus only on information present in the context
52
- - If unsure, explicitly state that the information is not in the context
53
- - Ensure responses are clear and directly address the question
54
 
55
  Context: {context}
56
 
@@ -60,91 +170,197 @@ Previous conversation:
60
  Question: {question}
61
 
62
  Answer:"""
63
-
64
- def process_pdf(self, file_path):
65
  try:
66
- logger.info(f"Starting PDF processing for file: {file_path}")
67
-
68
- if file_path is None:
69
- logger.warning("No file provided")
70
- return "Please upload a PDF file."
71
 
72
- if not os.path.exists(file_path):
73
- logger.error(f"File not found at path: {file_path}")
74
- return f"File not found: {file_path}"
75
 
76
- # Reset conversation history when new PDF is loaded
77
  self.conversation_history = []
78
- logger.info("Conversation history reset")
79
-
80
- # Read PDF directly from the file path
81
- logger.info("Reading PDF file...")
82
  pdf_reader = PdfReader(file_path)
83
- text = ""
84
- for i, page in enumerate(pdf_reader.pages):
85
- try:
86
- text += page.extract_text()
87
- logger.debug(f"Extracted text from page {i+1}")
88
- except Exception as e:
89
- logger.error(f"Error extracting text from page {i+1}: {str(e)}")
90
-
91
- if not text.strip():
92
- logger.warning("No text extracted from PDF")
93
- return "No text could be extracted from the PDF. Please make sure it's not empty or scanned."
94
-
95
- # Split text into chunks
96
- logger.info("Splitting text into chunks...")
97
- text_splitter = RecursiveCharacterTextSplitter(
98
- chunk_size=10000,
99
- chunk_overlap=2000,
100
- length_function=len
101
  )
102
- chunks = text_splitter.split_text(text)
103
- logger.info(f"Created {len(chunks)} chunks")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
104
 
105
- if not chunks:
106
- logger.warning("No chunks created from text")
107
- return "No chunks were created. The PDF might be empty."
 
108
 
109
- # Create vector store
110
- logger.info("Creating vector store...")
111
- self.vector_store = FAISS.from_texts(chunks, self.embeddings)
112
- logger.info("Vector store created successfully")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
113
 
114
- return "PDF processed successfully! You can now ask questions about it."
115
- except Exception as e:
116
- error_msg = f"Error processing PDF: {str(e)}"
117
- logger.error(error_msg)
118
- logger.error(f"Traceback: {traceback.format_exc()}")
119
- return error_msg
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
120
 
121
- def generate_response(self, message, history):
122
  try:
123
  logger.info(f"Generating response for message: {message}")
124
 
125
- if self.vector_store is None:
126
- logger.warning("No vector store available - PDF not processed")
127
  return "Please upload and process a PDF first."
128
 
129
  query = message.strip()
130
  if not query:
131
- logger.warning("Empty query received")
132
  return "Please enter a question."
133
 
134
- # Search for relevant chunks
135
- logger.info("Searching for relevant chunks...")
136
- relevant_chunks = self.vector_store.similarity_search(query, k=3)
137
- context = "\n\n".join([doc.page_content for doc in relevant_chunks])
138
- logger.debug(f"Found {len(relevant_chunks)} relevant chunks")
139
-
140
  # Format conversation history
141
- logger.debug(f"Processing conversation history (length: {len(history)})")
142
  conversation_history = "\n".join([
143
  f"Q: {q}\nA: {a}" for q, a in history[-3:] if q and a
144
  ])
145
 
146
- # Create prompt with system prompt, context, and conversation history
147
- logger.debug("Creating prompt...")
148
  prompt = self.system_prompt.format(
149
  context=context,
150
  conversation_history=conversation_history,
@@ -177,15 +393,14 @@ Answer:"""
177
  logger.error(f"Traceback: {traceback.format_exc()}")
178
  return error_msg
179
 
180
- # Create Gradio interface
181
  def create_gradio_interface():
182
  try:
183
  logger.info("Creating Gradio interface...")
184
  api_key = os.getenv("HF_API_KEY")
185
- rag = RAGApplication(hf_api_key=api_key)
186
 
187
  with gr.Blocks() as demo:
188
- gr.Markdown("# PDF Question Answering System")
189
 
190
  with gr.Row():
191
  pdf_input = gr.File(
@@ -209,7 +424,7 @@ def create_gradio_interface():
209
  theme="soft",
210
  examples=[
211
  "What is the main topic of this document?",
212
- "Can you summarize the key points?",
213
  "What are the main conclusions?",
214
  ],
215
  )
 
8
  import logging
9
  import traceback
10
  from datetime import datetime
11
+ from typing import List, Dict, Tuple, Any
12
+ import re
13
 
14
  # Configure logging
15
  logging.basicConfig(
16
  level=logging.DEBUG,
17
  format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
18
  handlers=[
19
+ logging.FileHandler(f'enhanced_rag_{datetime.now().strftime("%Y%m%d_%H%M%S")}.log'),
20
  logging.StreamHandler()
21
  ]
22
  )
23
  logger = logging.getLogger(__name__)
24
 
25
+ class TextPreprocessor:
26
+ @staticmethod
27
+ def clean_text(text: str) -> str:
28
+ """Clean and normalize text content."""
29
+ # Remove multiple spaces
30
+ text = re.sub(r'\s+', ' ', text)
31
+ # Remove multiple newlines
32
+ text = re.sub(r'\n\s*\n', '\n\n', text)
33
+ # Normalize quotes
34
+ text = re.sub(r'[\u201c\u201d]', '"', text)
35
+ # Remove header/footer artifacts
36
+ text = re.sub(r'^.*Page \d+.*$', '', text, flags=re.MULTILINE)
37
+ return text.strip()
38
+
39
+ @staticmethod
40
+ def extract_section_headers(text: str) -> List[str]:
41
+ """Extract potential section headers from text."""
42
+ # Simple header detection (can be enhanced based on document structure)
43
+ header_pattern = r'^(?:[A-Z][A-Za-z\s]{2,50}:?|(?:\d+\.){1,3}\s+[A-Z][A-Za-z\s]{2,50})$'
44
+ headers = re.findall(header_pattern, text, re.MULTILINE)
45
+ return headers
46
+
47
+ def create_page_chunks(pdf_reader: PdfReader) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
48
+ """
49
+ Creates both page-level and semantic chunks from PDF content.
50
+ """
51
+ page_chunks = []
52
+ semantic_chunks = []
53
+ preprocessor = TextPreprocessor()
54
+
55
+ # Configure text splitters
56
+ semantic_splitter = RecursiveCharacterTextSplitter(
57
+ chunk_size=1000,
58
+ chunk_overlap=200,
59
+ separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""],
60
+ length_function=len
61
+ )
62
+
63
+ # Sliding window parameters
64
+ window_size = 2000
65
+ window_overlap = 500
66
+
67
+ for page_num, page in enumerate(pdf_reader.pages, 1):
68
+ try:
69
+ page_text = page.extract_text()
70
+ if not page_text.strip():
71
+ continue
72
+
73
+ # Clean and preprocess text
74
+ cleaned_text = preprocessor.clean_text(page_text)
75
+ headers = preprocessor.extract_section_headers(cleaned_text)
76
+
77
+ # Store full page as a chunk
78
+ page_chunks.append({
79
+ "content": cleaned_text,
80
+ "metadata": {
81
+ "page_num": page_num,
82
+ "chunk_type": "full_page",
83
+ "section_headers": headers
84
+ }
85
+ })
86
+
87
+ # Create semantic chunks
88
+ semantic_page_chunks = semantic_splitter.split_text(cleaned_text)
89
+
90
+ # Create sliding windows for long content
91
+ if len(cleaned_text) > window_size:
92
+ start = 0
93
+ while start < len(cleaned_text):
94
+ window_text = cleaned_text[start:start + window_size]
95
+ semantic_chunks.append({
96
+ "content": window_text,
97
+ "metadata": {
98
+ "page_num": page_num,
99
+ "chunk_type": "sliding_window",
100
+ "window_start": start,
101
+ "section_headers": headers
102
+ }
103
+ })
104
+ start += (window_size - window_overlap)
105
+
106
+ # Add regular semantic chunks
107
+ for chunk_num, chunk in enumerate(semantic_page_chunks):
108
+ semantic_chunks.append({
109
+ "content": chunk,
110
+ "metadata": {
111
+ "page_num": page_num,
112
+ "chunk_num": chunk_num,
113
+ "chunk_type": "semantic",
114
+ "total_chunks": len(semantic_page_chunks),
115
+ "section_headers": headers
116
+ }
117
+ })
118
+
119
+ except Exception as e:
120
+ logger.error(f"Error processing page {page_num}: {str(e)}")
121
+ continue
122
+
123
+ return page_chunks, semantic_chunks
124
+
125
+ class EnhancedRAGApplication:
126
+ def __init__(self, hf_api_key: str):
127
  try:
128
  self.hf_api_key = hf_api_key
129
+ self.page_store = None
130
+ self.semantic_store = None
131
+ self.sliding_store = None
132
+
133
  logger.info("Initializing HuggingFace embeddings...")
134
  self.embeddings = HuggingFaceInferenceAPIEmbeddings(
135
  api_key=hf_api_key,
136
  model_name="sentence-transformers/all-MiniLM-L6-v2"
137
  )
138
+
139
  logger.info("Initializing HuggingFace client...")
140
  self.client = InferenceClient(api_key=hf_api_key)
141
  self.conversation_history = []
142
+
143
+ # Initialize cache
144
+ self.chunk_cache = {}
145
+ self.query_cache = {}
146
+
147
+ logger.info("EnhancedRAGApplication initialized successfully")
148
  except Exception as e:
149
+ logger.error(f"Error initializing EnhancedRAGApplication: {str(e)}")
150
  logger.error(f"Traceback: {traceback.format_exc()}")
151
  raise
152
 
 
157
  4. Use concise language and avoid unnecessary elaboration
158
  5. Maintain continuity with previous conversation when relevant
159
 
160
+ Context structure:
161
+ - Full page chunks provide complete context
162
+ - Semantic chunks provide focused information
163
+ - Sliding windows maintain context across chunk boundaries
 
164
 
165
  Context: {context}
166
 
 
170
  Question: {question}
171
 
172
  Answer:"""
173
+
174
+ def process_pdf(self, file_path: str) -> str:
175
  try:
176
+ logger.info(f"Starting enhanced PDF processing for file: {file_path}")
 
 
 
 
177
 
178
+ if file_path is None or not os.path.exists(file_path):
179
+ return "Please upload a valid PDF file."
 
180
 
181
+ # Reset conversation history and caches
182
  self.conversation_history = []
183
+ self.chunk_cache = {}
184
+ self.query_cache = {}
185
+
 
186
  pdf_reader = PdfReader(file_path)
187
+
188
+ # Create chunks
189
+ page_chunks, semantic_chunks = create_page_chunks(pdf_reader)
190
+
191
+ # Create vector stores
192
+ logger.info("Creating vector stores...")
193
+ self.page_store = FAISS.from_texts(
194
+ [chunk["content"] for chunk in page_chunks],
195
+ self.embeddings,
196
+ metadatas=[chunk["metadata"] for chunk in page_chunks]
197
+ )
198
+
199
+ self.semantic_store = FAISS.from_texts(
200
+ [chunk["content"] for chunk in semantic_chunks if chunk["metadata"]["chunk_type"] == "semantic"],
201
+ self.embeddings,
202
+ metadatas=[chunk["metadata"] for chunk in semantic_chunks if chunk["metadata"]["chunk_type"] == "semantic"]
 
 
203
  )
204
+
205
+ self.sliding_store = FAISS.from_texts(
206
+ [chunk["content"] for chunk in semantic_chunks if chunk["metadata"]["chunk_type"] == "sliding_window"],
207
+ self.embeddings,
208
+ metadatas=[chunk["metadata"] for chunk in semantic_chunks if chunk["metadata"]["chunk_type"] == "sliding_window"]
209
+ )
210
+
211
+ logger.info("Vector stores created successfully")
212
+ return "PDF processed successfully with enhanced chunking!"
213
+
214
+ except Exception as e:
215
+ logger.error(f"Error in enhanced PDF processing: {str(e)}")
216
+ return f"Error processing PDF: {str(e)}"
217
+
218
+ def mmr_reranking(self, results: List[Dict], lambda_param: float = 0.5, num_results: int = 3) -> List[Dict]:
219
+ """
220
+ Rerank results using Maximum Marginal Relevance to ensure diversity.
221
+ """
222
+ if len(results) <= num_results:
223
+ return results
224
+
225
+ selected = [results[0]] # Start with highest scored result
226
+ remaining = results[1:]
227
+
228
+ while len(selected) < num_results and remaining:
229
+ max_mmr_score = -1
230
+ max_mmr_idx = -1
231
+
232
+ for i, result in enumerate(remaining):
233
+ # Calculate similarity term
234
+ similarity_score = result["score"]
235
+
236
+ # Calculate diversity term
237
+ diversity_scores = [1 - self._calculate_similarity(result["content"], s["content"])
238
+ for s in selected]
239
+ diversity_score = min(diversity_scores)
240
+
241
+ # Calculate MMR score
242
+ mmr_score = lambda_param * similarity_score + (1 - lambda_param) * diversity_score
243
+
244
+ if mmr_score > max_mmr_score:
245
+ max_mmr_score = mmr_score
246
+ max_mmr_idx = i
247
 
248
+ if max_mmr_idx != -1:
249
+ selected.append(remaining.pop(max_mmr_idx))
250
+ else:
251
+ break
252
 
253
+ return selected
254
+
255
+ def _calculate_similarity(self, text1: str, text2: str) -> float:
256
+ """
257
+ Calculate similarity between two texts using embeddings.
258
+ """
259
+ try:
260
+ emb1 = self.embeddings.embed_query(text1)
261
+ emb2 = self.embeddings.embed_query(text2)
262
+ return sum(a * b for a, b in zip(emb1, emb2))
263
+ except:
264
+ return 0
265
+
266
+ def hybrid_retrieval(self, query: str, k_semantic: int = 3, k_pages: int = 1) -> str:
267
+ """
268
+ Performs hybrid retrieval using semantic, page-level, and sliding window chunks.
269
+ """
270
+ # Check query cache
271
+ cache_key = f"{query}_{k_semantic}_{k_pages}"
272
+ if cache_key in self.query_cache:
273
+ return self.query_cache[cache_key]
274
+
275
+ results = []
276
+
277
+ # Get relevant semantic chunks
278
+ semantic_results = self.semantic_store.similarity_search_with_score(
279
+ query, k=k_semantic
280
+ )
281
+
282
+ # Get relevant full pages
283
+ page_results = self.page_store.similarity_search_with_score(
284
+ query, k=k_pages
285
+ )
286
+
287
+ # Get relevant sliding windows
288
+ sliding_results = self.sliding_store.similarity_search_with_score(
289
+ query, k=k_semantic
290
+ )
291
+
292
+ # Combine all results
293
+ all_results = []
294
+
295
+ for doc, score in semantic_results:
296
+ all_results.append({
297
+ "content": doc.page_content,
298
+ "metadata": doc.metadata,
299
+ "score": score,
300
+ "type": "semantic"
301
+ })
302
 
303
+ for doc, score in page_results:
304
+ all_results.append({
305
+ "content": doc.page_content,
306
+ "metadata": doc.metadata,
307
+ "score": score,
308
+ "type": "page"
309
+ })
310
+
311
+ for doc, score in sliding_results:
312
+ all_results.append({
313
+ "content": doc.page_content,
314
+ "metadata": doc.metadata,
315
+ "score": score,
316
+ "type": "sliding_window"
317
+ })
318
+
319
+ # Apply MMR reranking
320
+ reranked_results = self.mmr_reranking(all_results)
321
+
322
+ # Combine context while preserving document structure
323
+ context = []
324
+ for result in reranked_results:
325
+ context_str = f"[Page {result['metadata']['page_num']}"
326
+
327
+ if result['type'] == "semantic":
328
+ context_str += f", Chunk {result['metadata']['chunk_num']}"
329
+ elif result['type'] == "sliding_window":
330
+ context_str += f", Window {result['metadata']['window_start']}"
331
+
332
+ if result['metadata'].get('section_headers'):
333
+ context_str += f", Section: {result['metadata']['section_headers'][0]}"
334
+
335
+ context_str += f"]: {result['content']}"
336
+ context.append(context_str)
337
+
338
+ final_context = "\n\n".join(context)
339
+
340
+ # Cache the result
341
+ self.query_cache[cache_key] = final_context
342
+ return final_context
343
 
344
+ def generate_response(self, message: str, history: List[Tuple[str, str]]) -> str:
345
  try:
346
  logger.info(f"Generating response for message: {message}")
347
 
348
+ if not any([self.page_store, self.semantic_store, self.sliding_store]):
 
349
  return "Please upload and process a PDF first."
350
 
351
  query = message.strip()
352
  if not query:
 
353
  return "Please enter a question."
354
 
355
+ # Get relevant context using hybrid retrieval
356
+ context = self.hybrid_retrieval(query)
357
+
 
 
 
358
  # Format conversation history
 
359
  conversation_history = "\n".join([
360
  f"Q: {q}\nA: {a}" for q, a in history[-3:] if q and a
361
  ])
362
 
363
+ # Create prompt
 
364
  prompt = self.system_prompt.format(
365
  context=context,
366
  conversation_history=conversation_history,
 
393
  logger.error(f"Traceback: {traceback.format_exc()}")
394
  return error_msg
395
 
 
396
  def create_gradio_interface():
397
  try:
398
  logger.info("Creating Gradio interface...")
399
  api_key = os.getenv("HF_API_KEY")
400
+ rag = EnhancedRAGApplication(hf_api_key=api_key)
401
 
402
  with gr.Blocks() as demo:
403
+ gr.Markdown("# Enhanced PDF Question Answering System")
404
 
405
  with gr.Row():
406
  pdf_input = gr.File(
 
424
  theme="soft",
425
  examples=[
426
  "What is the main topic of this document?",
427
+ "Can you summarize the key points?",
428
  "What are the main conclusions?",
429
  ],
430
  )