rohannsinghal committed on
Commit
6179555
Β·
1 Parent(s): ac1208e

changes to main_api.py, making it more robust and reliable

Browse files
Files changed (2) hide show
  1. app/main_api.py +853 -657
  2. requirements.txt +6 -1
app/main_api.py CHANGED
@@ -1,14 +1,18 @@
1
- # --- OPTIMIZED SEMANTIC RAG SYSTEM ---
2
 
3
  import os
4
  import json
5
  import uuid
6
  import time
7
  import re
8
- from typing import List, Dict, Any, Optional
9
- import logging
10
  import asyncio
 
 
11
  from collections import defaultdict
 
 
 
 
12
 
13
  # FastAPI and core dependencies
14
  from fastapi import FastAPI, Body, HTTPException, Request, Depends, Header
@@ -24,782 +28,974 @@ from langchain.llms.base import LLM
24
  from langchain.callbacks.manager import CallbackManagerForLLMRun
25
  from langchain.schema.document import Document as LangChainDocument
26
 
27
- # Document processing imports
28
  import fitz # PyMuPDF
29
  import pdfplumber
30
-
31
- # LLM Integration
 
 
 
 
 
 
 
 
 
 
32
  import groq
 
 
 
 
33
  import httpx
34
  from dotenv import load_dotenv
 
 
 
35
 
36
  # Setup
37
  load_dotenv()
38
  logging.basicConfig(level=logging.INFO)
39
  logger = logging.getLogger(__name__)
40
 
41
- app = FastAPI(title="Optimized Semantic RAG", version="2.1.0")
42
 
43
- # Updated CORS middleware for hackathon
44
  app.add_middleware(
45
  CORSMiddleware,
46
- allow_origins=["*"],
47
- allow_credentials=True,
48
- allow_methods=["GET", "POST"],
49
- allow_headers=["*", "Authorization", "Content-Type"],
50
  )
51
 
52
- # --- AUTHENTICATION MIDDLEWARE ---
53
 
54
async def verify_bearer_token(authorization: str = Header(None)):
    """Validate the Authorization header and return the Bearer token.

    Raises:
        HTTPException: 401 when the header is missing, is not in
            'Bearer <token>' form, or the token is implausibly short.
    """
    if not authorization:
        raise HTTPException(status_code=401, detail="Authorization header required")

    if not authorization.startswith("Bearer "):
        raise HTTPException(status_code=401, detail="Invalid authorization format. Use 'Bearer <token>'")

    # Strip only the leading scheme. The previous str.replace("Bearer ", "")
    # would also delete any later occurrence of "Bearer " inside the token
    # value itself, silently corrupting such tokens.
    token = authorization[len("Bearer "):]

    # Basic token validation
    if len(token) < 10:
        raise HTTPException(status_code=401, detail="Invalid token format")

    # For the hackathon, any properly formatted Bearer token is accepted.
    # In production, validate against a specific token or database.
    # Only a short prefix is logged so the full credential never lands in logs.
    logger.info(f"βœ… Authentication successful with token: {token[:10]}...")
    return token
-
73
- # --- OPTIMIZED SEMANTIC DOCUMENT PARSER ---
74
-
75
class DocumentChunk:
    """One chunk of parsed document text together with its metadata."""

    # Field names serialised by to_dict(), in output order.
    _FIELDS = ("content", "metadata", "chunk_id")

    def __init__(self, content: str, metadata: Dict[str, Any], chunk_id: str):
        # Store the three fields exactly as given; no copying or validation.
        self.content = content
        self.metadata = metadata
        self.chunk_id = chunk_id

    def to_dict(self):
        """Serialise the chunk to a plain, JSON-friendly dict."""
        return {name: getattr(self, name) for name in self._FIELDS}
87
 
88
- class OptimizedSemanticParser:
89
  def __init__(self):
90
- # Optimized parameters - balanced between quality and performance
91
  self.chunk_size = 1200
92
- self.chunk_overlap = 180
93
- self.max_chunks = 200 # Sweet spot for memory vs coverage
94
- self.max_pages = 20 # Reduced from 30
95
- logger.info("OptimizedSemanticParser initialized")
96
-
97
- def semantic_text_split(self, text: str, source: str) -> List[str]:
98
- """Optimized semantic text splitting - keeps intelligence while being efficient"""
99
- if not text or len(text) < 100:
100
- return [text] if text else []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
101
 
102
- chunks = []
 
 
 
 
 
 
 
 
103
 
104
- # Semantic boundary patterns (optimized list)
105
- semantic_patterns = [
106
- r'\n\s*(?:\d+\.)+\s*[A-Z][^.\n]*[.:]', # Numbered sections
107
- r'\n\s*[A-Z][A-Z\s]{8,}[:\n]', # ALL CAPS HEADINGS
108
- r'\n\s*(?:EXCLUSIONS?|BENEFITS?|COVERAGE|DEFINITIONS?)', # Key sections
109
- r'\n\s*(?:WAITING\s+PERIOD|GRACE\s+PERIOD|CLAIMS?)', # Important terms
110
- ]
111
 
112
- # Find semantic boundaries efficiently
113
- boundaries = [0]
114
- for pattern in semantic_patterns:
115
- matches = re.finditer(pattern, text, re.IGNORECASE)
116
- boundaries.extend(match.start() for match in matches)
117
 
118
- boundaries.append(len(text))
119
- boundaries = sorted(set(boundaries))
120
 
121
- # Create semantic chunks
122
- for i in range(len(boundaries) - 1):
123
- section_start = boundaries[i]
124
- section_end = boundaries[i + 1]
125
- section_text = text[section_start:section_end].strip()
126
 
127
- if len(section_text) <= self.chunk_size:
128
- if section_text and len(section_text) > 80:
129
- chunks.append(section_text)
130
- else:
131
- # Split large sections intelligently
132
- sub_chunks = self._split_section_smartly(section_text)
133
- chunks.extend(sub_chunks)
134
-
135
- # Fallback to sentence-based splitting if no boundaries found
136
- if len(chunks) == 0:
137
- chunks = self._fallback_sentence_split(text)
138
-
139
- # Limit total chunks for memory management
140
- chunks = chunks[:self.max_chunks]
141
- logger.info(f"Split {source} into {len(chunks)} semantic chunks")
142
- return chunks
143
-
144
- def _split_section_smartly(self, text: str) -> List[str]:
145
- """Smart splitting for large sections"""
 
 
 
 
 
 
 
146
  chunks = []
147
- sentences = re.split(r'(?<=[.!?])\s+', text)
148
-
149
- current_chunk = ""
150
- for sentence in sentences:
151
- if len(current_chunk) + len(sentence) <= self.chunk_size:
152
- current_chunk += sentence + " "
153
- else:
154
- if current_chunk.strip():
155
- chunks.append(current_chunk.strip())
156
- current_chunk = sentence + " "
157
-
158
- if current_chunk.strip():
159
- chunks.append(current_chunk.strip())
160
 
161
- return chunks
162
-
163
- def _fallback_sentence_split(self, text: str) -> List[str]:
164
- """Fallback intelligent sentence-based splitting"""
165
- chunks = []
166
- sentences = re.split(r'(?<=[.!?])\s+', text)
167
 
168
- current_chunk = ""
169
- for sentence in sentences:
170
- if len(current_chunk) + len(sentence) <= self.chunk_size:
171
- current_chunk += sentence + " "
172
- else:
173
- if current_chunk.strip():
174
- chunks.append(current_chunk.strip())
175
- current_chunk = sentence + " "
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
176
 
177
- if current_chunk.strip():
178
- chunks.append(current_chunk.strip())
 
179
 
180
  return chunks
181
-
182
- def extract_semantic_tables(self, file_path: str) -> str:
183
- """Optimized semantic table extraction"""
184
  table_text = ""
185
- table_count = 0
186
- max_tables = 12 # Balanced number
187
-
188
  try:
189
  with pdfplumber.open(file_path) as pdf:
190
- # Process key pages only
191
- pages_to_process = list(range(min(len(pdf.pages), 18)))
192
-
193
- for page_num in pages_to_process:
194
- if table_count >= max_tables:
195
- break
196
-
197
- page = pdf.pages[page_num]
198
  tables = page.find_tables()
199
-
200
- for table in tables[:2]: # Max 2 tables per page for efficiency
201
- if table_count >= max_tables:
202
- break
203
-
204
  try:
205
  table_data = table.extract()
206
- if table_data and len(table_data) >= 2:
207
-
208
- # Semantic relevance check (optimized)
209
- table_str = str(table_data).lower()
210
- insurance_keywords = ['premium', 'coverage', 'benefit', 'waiting', 'exclusion',
211
- 'claim', 'limit', 'sum', 'medical', 'hospital']
212
-
213
- if any(keyword in table_str for keyword in insurance_keywords):
214
- # Skip administrative tables
215
- if not any(admin in table_str for admin in ['ombudsman', 'lalit bhawan']):
216
-
217
- # Format table efficiently
218
- table_md = f"\n**POLICY TABLE {table_count + 1} (Page {page_num + 1})**\n"
219
-
220
- # Limit rows for memory efficiency
221
- limited_data = table_data[:min(15, len(table_data))]
222
-
223
- for row in limited_data:
224
- if row:
225
- row_str = " | ".join(str(cell or "")[:40] for cell in row)
226
- table_md += f"| {row_str} |\n"
227
-
228
- table_text += table_md + "\n"
229
- table_count += 1
230
-
231
- except Exception:
232
  continue
233
-
234
- logger.info(f"Extracted {table_count} semantic tables")
235
-
236
  except Exception as e:
237
- logger.warning(f"Semantic table extraction failed: {e}")
238
-
239
  return table_text
240
-
241
- def process_pdf_optimized_semantic(self, file_path: str) -> List[DocumentChunk]:
242
- """Optimized semantic PDF processing - keeps intelligence while being memory efficient"""
243
- logger.info(f"πŸš€ Processing PDF with optimized semantics: {os.path.basename(file_path)}")
244
- start_time = time.time()
245
- chunks = []
246
-
247
  try:
248
- # Efficient text extraction
249
- doc = fitz.open(file_path)
250
  full_text = ""
251
- total_pages = len(doc)
252
 
253
- # Process optimized number of pages
254
- pages_to_process = list(range(min(total_pages, self.max_pages)))
 
 
255
 
256
- for page_num in pages_to_process:
257
- try:
258
- page = doc[page_num]
259
- page_text = page.get_text()
260
-
261
- # Intelligent content filtering
262
- lines = page_text.split('\n')
263
- filtered_lines = []
264
-
265
- for line in lines:
266
- line = line.strip()
267
- if (line and len(line) > 15 and
268
- not any(noise in line.lower() for noise in
269
- ['ombudsman', 'lalit bhawan', 'page ']) and
270
- not re.match(r'^\d+\s*$', line)): # Skip page numbers
271
- filtered_lines.append(line)
272
-
273
- clean_text = '\n'.join(filtered_lines)
274
- if clean_text and len(clean_text) > 100:
275
- full_text += f"\n\nPage {page_num + 1}:\n{clean_text}"
276
-
277
- except Exception:
278
- continue
279
-
280
- doc.close()
281
-
282
- # Add semantic tables
283
- table_content = self.extract_semantic_tables(file_path)
284
- if table_content:
285
- full_text += f"\n\n{'='*40}\nKEY POLICY TABLES\n{'='*40}\n{table_content}"
286
-
287
- # Create semantic chunks
288
- text_chunks = self.semantic_text_split(full_text, os.path.basename(file_path))
289
-
290
- # Create chunks with optimized semantic metadata
291
- for idx, chunk_text in enumerate(text_chunks):
292
- # Lightweight semantic analysis
293
- chunk_lower = chunk_text.lower()
 
294
 
295
- # Detect content types efficiently
296
- content_types = []
297
- type_indicators = {
298
- 'definitions': ['means', 'definition', 'shall mean'],
299
- 'coverage': ['coverage', 'covered', 'benefit'],
300
- 'exclusions': ['exclusion', 'excluded', 'not covered'],
301
- 'waiting_periods': ['waiting period', 'wait'],
302
- 'claims': ['claim', 'settlement'],
303
- 'premium': ['premium', 'payment', 'grace period'],
304
- 'medical': ['hospital', 'medical', 'treatment']
305
- }
306
 
307
- for content_type, indicators in type_indicators.items():
308
- if any(indicator in chunk_lower for indicator in indicators):
309
- content_types.append(content_type)
 
310
 
311
- # Calculate simple relevance score
312
- insurance_terms = ['policy', 'coverage', 'benefit', 'exclusion', 'claim', 'premium']
313
- relevance_score = sum(1 for term in insurance_terms if term in chunk_lower)
314
-
315
- chunks.append(DocumentChunk(
316
- content=chunk_text,
317
- metadata={
318
- "source": os.path.basename(file_path),
319
- "chunk_index": idx,
320
- "document_type": "optimized_semantic",
321
- "content_types": ", ".join(content_types) if content_types else "general",
322
- "total_pages": total_pages,
323
- "chunk_length": len(chunk_text),
324
- "relevance_score": relevance_score,
325
- "has_tables": "table" in chunk_text.lower()
326
- },
327
- chunk_id=str(uuid.uuid4())
328
- ))
329
-
330
- elapsed = time.time() - start_time
331
- logger.info(f"βœ… Optimized semantic processing complete in {elapsed:.2f}s: {len(chunks)} chunks")
332
- return chunks
333
-
334
  except Exception as e:
335
- logger.error(f"❌ Optimized semantic processing failed: {e}")
336
- return self._emergency_fallback(file_path)
337
-
338
- def _emergency_fallback(self, file_path: str) -> List[DocumentChunk]:
339
- """Emergency fallback that still maintains some intelligence"""
340
- logger.info("πŸ†˜ Emergency fallback with basic semantics")
 
 
 
 
 
 
341
  try:
342
- doc = fitz.open(file_path)
343
- text_parts = []
 
 
 
 
 
 
 
 
 
344
 
345
- for page_num in range(min(15, len(doc))):
346
- page = doc[page_num]
347
- page_text = page.get_text()
348
-
349
- # Basic semantic filtering
350
- if (len(page_text.strip()) > 100 and
351
- 'ombudsman' not in page_text.lower()):
352
- text_parts.append(page_text)
353
-
354
- doc.close()
355
- full_text = "\n\n".join(text_parts)
356
-
357
- # Simple but effective chunking
358
- chunks = []
359
- sentences = re.split(r'(?<=[.!?])\s+', full_text)
360
- current_chunk = ""
361
-
362
- for sentence in sentences:
363
- if len(current_chunk) + len(sentence) <= 1000:
364
- current_chunk += sentence + " "
365
- else:
366
- if current_chunk.strip():
367
- chunks.append(DocumentChunk(
368
- content=current_chunk.strip(),
369
- metadata={
370
- "source": os.path.basename(file_path),
371
- "chunk_index": len(chunks),
372
- "document_type": "emergency_fallback"
373
- },
374
- chunk_id=str(uuid.uuid4())
375
- ))
376
- current_chunk = sentence + " "
377
-
378
- if current_chunk.strip():
379
- chunks.append(DocumentChunk(
380
- content=current_chunk.strip(),
381
- metadata={
382
- "source": os.path.basename(file_path),
383
- "chunk_index": len(chunks),
384
- "document_type": "emergency_fallback"
385
- },
386
- chunk_id=str(uuid.uuid4())
387
- ))
388
-
389
- return chunks[:100] # Limit for safety
390
-
391
  except Exception as e:
392
- logger.error(f"Emergency fallback failed: {e}")
393
- raise Exception("All processing methods failed")
394
-
395
- # --- GROQ LLM WRAPPER ---
396
-
397
class GroqLLM(LLM):
    """LangChain LLM wrapper around the Groq chat-completions client.

    Each call pulls the next key from ``api_key_manager`` so load is
    rotated across the configured Groq API keys.
    """

    groq_client: Any       # groq.Groq client instance (pydantic field)
    api_key_manager: Any   # provides get_next_api_key() for key rotation

    class Config:
        # The groq client / key manager are not pydantic models, so pydantic
        # must be told to accept arbitrary Python objects as field values.
        arbitrary_types_allowed = True

    @property
    def _llm_type(self) -> str:
        return "groq"

    def _call(self, prompt: str, stop: Optional[List[str]] = None, run_manager: Optional[CallbackManagerForLLMRun] = None) -> str:
        """Send *prompt* to Groq and return the completion text.

        Returns a fixed sentinel string on any failure so one bad call
        does not abort a whole batch of questions.
        """
        try:
            # Rotate to the next configured API key before every request.
            api_key = self.api_key_manager.get_next_api_key()
            self.groq_client.api_key = api_key

            response = self.groq_client.chat.completions.create(
                model="llama-3.3-70b-versatile",
                messages=[{"role": "user", "content": prompt}],
                temperature=0.1,
                max_tokens=900,
                top_p=0.9,
                stop=stop
            )

            return response.choices[0].message.content.strip()

        except Exception as e:
            # Best-effort contract: log and return a sentinel instead of raising.
            logger.error(f"Groq LLM call failed: {e}")
            return "Error generating response"
427
-
428
- # --- OPTIMIZED SEMANTIC RAG PIPELINE ---
429
-
430
- class OptimizedSemanticRAGPipeline:
431
- def __init__(self, collection_name: str, request: Request):
432
- self.collection_name = collection_name
433
- self.embedding_model = request.app.state.embedding_model
434
- self.groq_llm = request.app.state.groq_llm
435
 
436
- self.vectorstore = Chroma(
437
- collection_name=collection_name,
438
- embedding_function=self.embedding_model,
439
- persist_directory=CHROMA_PERSIST_DIR
440
- )
441
- self.qa_chain = None
442
- logger.info(f"βœ… Optimized semantic RAG pipeline initialized: {collection_name}")
443
-
444
- def clean_response(self, answer: str) -> str:
445
- """Comprehensive response cleaning for professional formatting"""
446
- if not answer:
447
- return answer
448
-
449
- # Remove excessive newlines first
450
- answer = re.sub(r'\n\s*\n\s*\n+', '\n\n', answer)
451
- answer = re.sub(r'\n\s*\n', '\n\n', answer)
452
-
453
- # Remove ALL excessive quotation marks - comprehensive patterns
454
- # Remove quotes around single words
455
- answer = re.sub(r'"(\w+)"', r'\1', answer)
456
-
457
- # Remove quotes around short phrases (2-5 words)
458
- answer = re.sub(r'"([^"]{1,50})"', r'\1', answer)
459
-
460
- # Remove quotes around ALL CAPS words/phrases
461
- answer = re.sub(r'"([A-Z\s]{2,50})"', r'\1', answer)
462
-
463
- # Remove quotes around numbers, percentages, amounts
464
- answer = re.sub(r'"(Rs\.?\s*[\d,]+[/-]*)"', r'\1', answer)
465
- answer = re.sub(r'"(\d+%)"', r'\1', answer)
466
- answer = re.sub(r'"(\d+\s*(?:days?|months?|years?|lacs?))"', r'\1', answer)
467
- answer = re.sub(r'"(\d+[.,]\d+)"', r'\1', answer)
468
-
469
- # Remove quotes around plan names and policy terms
470
- answer = re.sub(r'"(Plan\s+[A-Z])"', r'\1', answer)
471
- answer = re.sub(r'"([A-Z]+\s*[A-Z]*)"', r'\1', answer)
472
-
473
- # Clean up policy statement formats - make them flow naturally
474
- answer = re.sub(r'[Aa]s stated in the policy[:\s]*"([^"]+)"', r'As per the policy, \1', answer)
475
- answer = re.sub(r'[Aa]ccording to the policy[:\s]*"([^"]+)"', r'According to the policy, \1', answer)
476
- answer = re.sub(r'[Tt]he policy states[:\s]*"([^"]+)"', r'The policy states that \1', answer)
477
- answer = re.sub(r'[Aa]s per the policy[:\s]*"([^"]+)"', r'As per the policy, \1', answer)
478
- answer = re.sub(r'[Tt]he policy mentions[:\s]*"([^"]+)"', r'The policy mentions that \1', answer)
479
-
480
- # Remove quotes from technical terms and common insurance phrases
481
- insurance_terms = [
482
- 'sum insured', 'waiting period', 'grace period', 'pre-existing',
483
- 'cumulative bonus', 'no claim discount', 'room rent', 'icu charges',
484
- 'ayush', 'hospital', 'medical expenses', 'policy period', 'exclusion',
485
- 'inpatient', 'outpatient', 'domiciliary', 'cashless', 'reimbursement'
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
486
  ]
487
 
488
- for term in insurance_terms:
489
- # Remove quotes around these terms (case insensitive)
490
- pattern = f'"{re.escape(term)}"'
491
- answer = re.sub(pattern, term, answer, flags=re.IGNORECASE)
492
- # Also handle capitalized versions
493
- pattern = f'"{re.escape(term.upper())}"'
494
- answer = re.sub(pattern, term.upper(), answer, flags=re.IGNORECASE)
495
 
496
- # Clean up remaining problematic quote patterns
497
- answer = re.sub(r'"\s*([^"]*)\s*"', r'\1', answer) # Any remaining quoted text
 
 
 
498
 
499
- # Fix spacing issues
500
- answer = re.sub(r'\s+', ' ', answer) # Multiple spaces to single
501
- answer = answer.replace(' ,', ',') # Space before comma
502
- answer = answer.replace(' .', '.') # Space before period
503
- answer = answer.replace('( ', '(') # Space after opening parenthesis
504
- answer = answer.replace(' )', ')') # Space before closing parenthesis
505
 
506
- # Clean up line breaks within sentences
507
- answer = re.sub(r'([a-z,])\s*\n\s*([a-z])', r'\1 \2', answer)
508
 
509
- # Final cleanup - remove any remaining escape characters
510
- answer = answer.replace('\\"', '"') # Remove escape characters
511
- answer = answer.replace('\\n', ' ') # Convert literal \n to space
512
- answer = answer.replace('\\"', '') # Remove any remaining escaped quotes
 
 
 
513
 
514
- # Ensure proper sentence structure
515
- answer = re.sub(r'([.!?])\s*([A-Z])', r'\1 \2', answer) # Space after sentence end
 
516
 
517
- return answer.strip()
518
-
519
- def add_documents(self, chunks: List[Dict[str, Any]]):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
520
  if not chunks:
521
- logger.error("❌ No chunks provided!")
522
  return
523
-
524
- logger.info(f"πŸ“š Adding {len(chunks)} chunks with optimized semantic processing...")
525
 
526
- # Optimized semantic filtering
 
 
527
  quality_chunks = []
528
  for chunk in chunks:
529
  content = chunk['content']
530
- metadata = chunk.get('metadata', {})
531
 
532
- # Multi-factor quality assessment
533
- quality_factors = []
 
534
 
535
- # Length factor
536
- if len(content) > 120:
537
- quality_factors.append(1)
538
-
539
- # Insurance relevance factor
540
- insurance_terms = ['policy', 'coverage', 'benefit', 'exclusion', 'claim', 'premium',
541
- 'hospital', 'medical', 'treatment', 'waiting', 'insured']
542
- term_count = sum(1 for term in insurance_terms if term in content.lower())
543
- if term_count >= 2:
544
- quality_factors.append(2)
545
 
546
- # Content type factor
547
- content_types = metadata.get('content_types', '')
548
- if content_types and content_types != 'general':
549
- quality_factors.append(1)
 
550
 
551
- # Noise penalty
552
- if any(noise in content.lower() for noise in ['ombudsman', 'lalit bhawan']):
553
- quality_factors.append(-2)
 
554
 
555
- # Calculate final quality score
556
- quality_score = sum(quality_factors)
 
 
557
 
558
- if quality_score > 0:
559
  quality_chunks.append(chunk)
560
-
561
- # Sort by relevance score if available
562
- quality_chunks.sort(key=lambda x: x['metadata'].get('relevance_score', 0), reverse=True)
563
 
564
- # Limit for memory efficiency while keeping quality
565
- if len(quality_chunks) > 120:
566
- quality_chunks = quality_chunks[:120]
567
-
568
- logger.info(f"πŸ“š Filtered to {len(quality_chunks)} high-quality semantic chunks")
569
-
570
- langchain_docs = [
571
- LangChainDocument(page_content=chunk['content'], metadata=chunk['metadata'])
572
  for chunk in quality_chunks
573
  ]
574
-
575
- self.vectorstore.add_documents(langchain_docs)
576
- logger.info(f"βœ… Added {len(langchain_docs)} semantic documents to vectorstore")
577
-
578
- # Optimized retriever with semantic search
579
- retriever = self.vectorstore.as_retriever(
580
- search_type="mmr", # Keep MMR for diversity
581
- search_kwargs={
582
- "k": 10, # Balanced retrieval
583
- "fetch_k": 20, # Reasonable search space
584
- "lambda_mult": 0.6 # Balance relevance vs diversity
585
- }
586
- )
587
-
588
- # Enhanced semantic prompt template with strict formatting rules
589
- prompt_template = PromptTemplate(
590
- input_variables=["context", "question"],
591
- template="""You are an expert insurance policy analyst. Analyze the policy document context to provide accurate, detailed answers.
592
-
593
- POLICY DOCUMENT CONTEXT:
594
- {context}
595
-
596
- QUESTION: {question}
597
-
598
- CRITICAL FORMATTING INSTRUCTIONS:
599
- - Write in natural, flowing sentences without excessive quotation marks
600
- - When referencing policy text, paraphrase or integrate naturally into sentences
601
- - Do NOT put quotes around single words, numbers, percentages, or short phrases
602
- - Do NOT put quotes around plan names (Plan A), amounts (Rs. 5,000), or time periods (30 days)
603
- - Write numbers and amounts directly: 30 days, 5%, Rs. 10,000, Plan A
604
- - Use quotes ONLY for exact lengthy policy clauses that need verbatim citation
605
- - Make the text read like professional analysis, not a quote-heavy document
606
-
607
- ANALYSIS INSTRUCTIONS:
608
- - Extract specific facts: numbers, percentages, time periods, conditions
609
- - Understand relationships between different policy sections
610
- - Be precise about conditions, exceptions, and qualifying circumstances
611
- - If information is partial, state what's available and note limitations
612
-
613
- RESPONSE STYLE:
614
- Write a comprehensive, naturally flowing analysis that reads professionally without excessive quotation marks or formatting issues.
615
-
616
- ANSWER:"""
617
- )
618
-
619
- self.qa_chain = RetrievalQA.from_chain_type(
620
- llm=self.groq_llm,
621
- chain_type="stuff",
622
- retriever=retriever,
623
- chain_type_kwargs={"prompt": prompt_template},
624
- return_source_documents=True
625
- )
626
-
627
- logger.info("βœ… Optimized semantic QA Chain ready")
628
-
629
  async def answer_question(self, question: str) -> str:
630
- if not self.qa_chain:
631
- return "Error: Semantic QA chain not initialized."
632
-
633
- logger.info(f"πŸ€” Semantic analysis for: {question}")
634
-
635
  try:
636
- # Retrieve with semantic understanding
637
- result = await asyncio.to_thread(self.qa_chain, {"query": question})
638
- raw_answer = result.get("result", "Failed to generate semantic answer.")
 
 
 
 
 
 
639
 
640
- # Clean up the response formatting
641
- clean_answer = self.clean_response(raw_answer)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
642
 
643
- logger.info(f"βœ… Semantic answer generated: {len(clean_answer)} characters")
644
- return clean_answer
645
-
646
  except Exception as e:
647
- logger.error(f"❌ Error during semantic QA: {e}")
648
- return "An error occurred while processing the semantic question."
649
-
650
- # --- API KEY MANAGER ---
651
-
652
class GroqAPIKeyManager:
    """Round-robin manager for a pool of Groq API keys."""

    def __init__(self, api_keys: List[str]):
        # Drop empty / whitespace-only entries (e.g. produced by trailing
        # commas when GROQ_API_KEYS is split on ',').
        self.api_keys = [key.strip() for key in api_keys if key.strip()]
        self.key_usage_count = defaultdict(int)
        self.current_key_index = 0
        logger.info(f"πŸ”‘ API Key Manager: {len(self.api_keys)} keys")

    def get_next_api_key(self):
        """Return the next key in round-robin order.

        Raises:
            ValueError: when no usable keys were configured.
        """
        if not self.api_keys:
            raise ValueError("No API keys available")

        key = self.api_keys[self.current_key_index % len(self.api_keys)]
        # Wrap the index instead of incrementing forever; the previous
        # version let current_key_index grow without bound in long-running
        # processes.
        self.current_key_index = (self.current_key_index + 1) % len(self.api_keys)
        # Actually record per-key usage; key_usage_count was created in
        # __init__ but never updated before.
        self.key_usage_count[key] += 1
        return key
666
 
667
- # --- CONFIGURATION ---
668
 
669
- GROQ_API_KEYS = os.getenv("GROQ_API_KEYS", "").split(',')
670
- EMBEDDING_MODEL = "BAAI/bge-small-en-v1.5"
671
- CHROMA_PERSIST_DIR = "/tmp/chroma_db_storage"
672
- UPLOAD_DIR = "/tmp/docs"
 
 
 
 
673
 
674
- @app.on_event("startup")
675
- async def startup_event():
676
- try:
677
- logger.info("πŸš€ Initializing optimized semantic services...")
678
-
679
- app.state.embedding_model = HuggingFaceEmbeddings(
680
- model_name=EMBEDDING_MODEL,
681
- model_kwargs={'device': 'cpu'},
682
- encode_kwargs={'normalize_embeddings': True}
683
- )
684
-
685
- app.state.api_key_manager = GroqAPIKeyManager(GROQ_API_KEYS)
686
- first_key = app.state.api_key_manager.get_next_api_key()
687
- app.state.groq_client = groq.Groq(api_key=first_key)
688
- app.state.groq_llm = GroqLLM(groq_client=app.state.groq_client, api_key_manager=app.state.api_key_manager)
689
 
690
- app.state.parsing_service = OptimizedSemanticParser()
 
 
 
691
 
692
- logger.info("βœ… All optimized semantic services initialized!")
 
693
 
694
- except Exception as e:
695
- logger.error(f"πŸ’₯ FATAL ERROR: {e}")
696
- raise e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
697
 
698
- # --- API MODELS (FIXED FOR HACKATHON) ---
699
 
700
  class SubmissionRequest(BaseModel):
701
  documents: List[str]
702
  questions: List[str]
703
 
704
- class Config:
705
- schema_extra = {
706
- "example": {
707
- "documents": ["https://example.com/document1.pdf"],
708
- "questions": ["What is the grace period?", "What are the exclusions?"]
709
- }
710
- }
711
-
712
  class SubmissionResponse(BaseModel):
713
- answers: List[str] # βœ… Fixed: Just strings, not objects
714
-
715
- class Config:
716
- schema_extra = {
717
- "example": {
718
- "answers": [
719
- "The grace period is 30 days for premium payment.",
720
- "The main exclusions include pre-existing diseases for 36 months."
721
- ]
722
- }
723
- }
724
 
725
- # --- MAIN ENDPOINT WITH AUTHENTICATION (FIXED FORMAT) ---
726
 
727
  @app.post("/hackrx/run", response_model=SubmissionResponse, dependencies=[Depends(verify_bearer_token)])
728
  async def run_submission(request: Request, submission_request: SubmissionRequest = Body(...)):
 
 
 
729
  try:
730
- logger.info(f"🎯 Processing {len(submission_request.documents)} documents, {len(submission_request.questions)} questions")
731
-
732
- parsing_service = request.app.state.parsing_service
733
- session_collection_name = f"opt_semantic_{uuid.uuid4().hex[:8]}"
734
- rag_pipeline = OptimizedSemanticRAGPipeline(collection_name=session_collection_name, request=request)
735
 
 
736
  all_chunks = []
737
-
738
- # Process documents with optimized semantics
739
- async with httpx.AsyncClient(timeout=90.0) as client:
740
- for doc_idx, doc_url in enumerate(submission_request.documents):
741
- try:
742
- logger.info(f"πŸ“₯ Downloading document {doc_idx + 1}")
743
- response = await client.get(doc_url, follow_redirects=True)
744
- response.raise_for_status()
745
-
746
- file_name = f"doc_{doc_idx}_{uuid.uuid4().hex[:8]}.pdf"
747
- temp_file_path = os.path.join(UPLOAD_DIR, file_name)
748
- os.makedirs(UPLOAD_DIR, exist_ok=True)
749
-
750
- with open(temp_file_path, "wb") as f:
751
- f.write(response.content)
752
-
753
- logger.info(f"πŸ“„ Processing with optimized semantics...")
754
- chunks = parsing_service.process_pdf_optimized_semantic(temp_file_path)
755
- chunk_dicts = [chunk.to_dict() for chunk in chunks]
756
- all_chunks.extend(chunk_dicts)
757
-
758
- os.remove(temp_file_path)
759
- logger.info(f"βœ… Processed {len(chunks)} semantic chunks")
760
-
761
- except Exception as e:
762
- logger.error(f"❌ Document processing failed: {e}")
763
- continue
764
-
765
- logger.info(f"πŸ“Š Total semantic chunks: {len(all_chunks)}")
766
-
 
 
 
 
 
 
 
 
 
767
  if not all_chunks:
768
- logger.error("❌ No chunks processed!")
769
- # βœ… Fixed: Return just strings
770
  return SubmissionResponse(answers=[
771
- "Document processing failed." for _ in submission_request.questions
 
772
  ])
773
-
774
- # Add to semantic RAG pipeline
775
- rag_pipeline.add_documents(all_chunks)
776
-
777
- # Answer questions with semantic understanding
778
- logger.info(f"❓ Answering questions with optimized semantics...")
779
- answers = [] # βœ… Fixed: Just collect string answers
780
 
781
- for question in submission_request.questions:
782
- try:
783
- answer = await rag_pipeline.answer_question(question)
784
- answers.append(answer) # βœ… Fixed: Just append the string answer
785
- except Exception as e:
786
- logger.error(f"❌ Question failed: {e}")
787
- answers.append("Failed to process question.") # βœ… Fixed: Just string
 
788
 
789
- logger.info("πŸŽ‰ All semantic questions processed!")
790
- return SubmissionResponse(answers=answers) # βœ… Fixed: Just the string list
 
 
 
 
 
 
 
 
 
791
 
792
  except Exception as e:
793
- logger.error(f"πŸ’₯ CRITICAL ERROR: {e}")
794
- # βœ… Fixed: Return just strings
 
795
  return SubmissionResponse(answers=[
796
- f"System error: {str(e)}" for _ in submission_request.questions
 
797
  ])
798
 
 
 
799
  @app.get("/")
800
  def read_root():
801
- return {"message": "Optimized Semantic RAG System", "status": "healthy"}
 
 
 
 
 
 
 
 
 
 
 
 
 
802
 
803
  @app.get("/health")
804
  def health_check():
805
- return {"status": "healthy", "version": "2.1.0"}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # --- ULTIMATE HACKATHON WINNING RAG SYSTEM ---
2
 
3
  import os
4
  import json
5
  import uuid
6
  import time
7
  import re
 
 
8
  import asyncio
9
+ import logging
10
+ from typing import List, Dict, Any, Optional, Union
11
  from collections import defaultdict
12
+ from itertools import cycle
13
+ import hashlib
14
+ import mimetypes
15
+ from pathlib import Path
16
 
17
  # FastAPI and core dependencies
18
  from fastapi import FastAPI, Body, HTTPException, Request, Depends, Header
 
28
  from langchain.callbacks.manager import CallbackManagerForLLMRun
29
  from langchain.schema.document import Document as LangChainDocument
30
 
31
+ # Multi-format document processing
32
  import fitz # PyMuPDF
33
  import pdfplumber
34
+ import docx # python-docx
35
+ import openpyxl
36
+ import csv
37
+ import zipfile
38
+ import rarfile
39
+ import email
40
+ from email.policy import default
41
+ import eml_parser
42
+ from bs4 import BeautifulSoup
43
+ import xml.etree.ElementTree as ET
44
+
45
+ # Multiple LLM providers
46
  import groq
47
+ import openai
48
+ import google.generativeai as genai
49
+
50
+ # Other dependencies
51
  import httpx
52
  from dotenv import load_dotenv
53
+ import cachetools
54
+ import threading
55
+ from concurrent.futures import ThreadPoolExecutor, as_completed
56
 
57
  # Setup
58
  load_dotenv()
59
  logging.basicConfig(level=logging.INFO)
60
  logger = logging.getLogger(__name__)
61
 
62
+ app = FastAPI(title="Ultimate Hackathon RAG System", version="3.0.0")
63
 
64
+ # Enhanced CORS for all scenarios
65
  app.add_middleware(
66
  CORSMiddleware,
67
+ allow_origins=["*"],
68
+ allow_credentials=True,
69
+ allow_methods=["*"],
70
+ allow_headers=["*"],
71
  )
72
 
73
+ # --- ANTI-JAILBREAK SECURITY SYSTEM ---
74
 
75
+ class SecurityGuard:
76
+ def __init__(self):
77
+ self.jailbreak_patterns = [
78
+ r'ignore.*previous.*instructions',
79
+ r'act.*as.*different.*character',
80
+ r'generate.*code.*(?:javascript|python|html)',
81
+ r'write.*program',
82
+ r'roleplay.*as',
83
+ r'pretend.*you.*are',
84
+ r'system.*prompt',
85
+ r'override.*settings',
86
+ r'bypass.*restrictions',
87
+ r'admin.*mode',
88
+ r'developer.*mode',
89
+ r'tell.*me.*about.*yourself',
90
+ r'what.*are.*you',
91
+ r'who.*created.*you'
92
+ ]
93
+
94
+ def detect_jailbreak(self, text: str) -> bool:
95
+ """Detect jailbreak attempts"""
96
+ text_lower = text.lower()
97
+ return any(re.search(pattern, text_lower) for pattern in self.jailbreak_patterns)
98
 
99
+ def sanitize_response(self, question: str, answer: str) -> str:
100
+ """Sanitize responses against jailbreaks"""
101
+ if self.detect_jailbreak(question):
102
+ return "I can only provide information based on the document content provided. Please ask questions about the document."
103
+
104
+ # Remove any potential code or script tags
105
+ answer = re.sub(r'<script.*?</script>', '', answer, flags=re.DOTALL | re.IGNORECASE)
106
+ answer = re.sub(r'<.*?>', '', answer) # Remove HTML tags
107
+
108
+ return answer
109
+
110
+ # --- MULTI-LLM PROVIDER SYSTEM ---
111
+
112
+ class MultiLLMManager:
113
+ def __init__(self):
114
+ # Initialize multiple LLM providers
115
+ self.groq_keys = cycle([k.strip() for k in os.getenv("GROQ_API_KEYS", "").split(',') if k.strip()])
116
+ self.openai_keys = cycle([k.strip() for k in os.getenv("OPENAI_API_KEYS", "").split(',') if k.strip()])
117
+ self.gemini_keys = cycle([k.strip() for k in os.getenv("GEMINI_API_KEYS", "").split(',') if k.strip()])
118
+
119
+ self.providers = ['groq', 'openai', 'gemini']
120
+ self.current_provider_index = 0
121
+
122
+ logger.info("πŸ”‘ Multi-LLM Manager initialized with fallback support")
123
 
124
+ async def get_response(self, prompt: str, max_tokens: int = 900) -> str:
125
+ """Get response with automatic fallback between providers"""
126
+ for attempt in range(len(self.providers)):
127
+ try:
128
+ provider = self.providers[self.current_provider_index]
129
+
130
+ if provider == 'groq':
131
+ return await self._groq_response(prompt, max_tokens)
132
+ elif provider == 'openai':
133
+ return await self._openai_response(prompt, max_tokens)
134
+ elif provider == 'gemini':
135
+ return await self._gemini_response(prompt, max_tokens)
136
+
137
+ except Exception as e:
138
+ logger.warning(f"{provider} failed: {e}")
139
+ self.current_provider_index = (self.current_provider_index + 1) % len(self.providers)
140
+ continue
141
+
142
+ return "Error: All LLM providers failed"
143
 
144
+ async def _groq_response(self, prompt: str, max_tokens: int) -> str:
145
+ key = next(self.groq_keys)
146
+ client = groq.Groq(api_key=key)
147
+
148
+ response = client.chat.completions.create(
149
+ model="llama-3.3-70b-versatile",
150
+ messages=[{"role": "user", "content": prompt}],
151
+ temperature=0.1,
152
+ max_tokens=max_tokens,
153
+ top_p=0.9
154
+ )
155
+ return response.choices[0].message.content.strip()
156
 
157
+ async def _openai_response(self, prompt: str, max_tokens: int) -> str:
158
+ key = next(self.openai_keys)
159
+ openai.api_key = key
160
+
161
+ response = await openai.ChatCompletion.acreate(
162
+ model="gpt-4o-mini",
163
+ messages=[{"role": "user", "content": prompt}],
164
+ temperature=0.1,
165
+ max_tokens=max_tokens
166
+ )
167
+ return response.choices[0].message.content.strip()
168
+
169
+ async def _gemini_response(self, prompt: str, max_tokens: int) -> str:
170
+ key = next(self.gemini_keys)
171
+ genai.configure(api_key=key)
172
+
173
+ model = genai.GenerativeModel('gemini-pro')
174
+ response = await model.generate_content_async(prompt)
175
+ return response.text.strip()
176
 
177
+ # --- UNIVERSAL DOCUMENT PROCESSOR ---
 
 
 
 
 
178
 
179
+ class UniversalDocumentProcessor:
180
  def __init__(self):
 
181
  self.chunk_size = 1200
182
+ self.chunk_overlap = 200
183
+ self.max_chunks = 250
184
+ self.max_pages = 30
185
+
186
+ # Smart caching system
187
+ self.cache = cachetools.TTLCache(maxsize=100, ttl=3600) # 1 hour TTL
188
+ self.security_guard = SecurityGuard()
189
+
190
+ # Supported formats
191
+ self.processors = {
192
+ '.pdf': self.process_pdf,
193
+ '.docx': self.process_docx,
194
+ '.doc': self.process_doc,
195
+ '.xlsx': self.process_excel,
196
+ '.xls': self.process_excel,
197
+ '.csv': self.process_csv,
198
+ '.txt': self.process_text,
199
+ '.html': self.process_html,
200
+ '.xml': self.process_xml,
201
+ '.eml': self.process_email,
202
+ '.zip': self.process_archive,
203
+ '.rar': self.process_archive,
204
+ '.json': self.process_json
205
+ }
206
 
207
+ logger.info("πŸš€ Universal Document Processor initialized")
208
+
209
+ def get_file_hash(self, content: bytes) -> str:
210
+ """Generate hash for caching"""
211
+ return hashlib.md5(content).hexdigest()
212
+
213
+ async def process_document(self, file_path: str, content: bytes) -> List[Dict[str, Any]]:
214
+ """Process any document format with caching"""
215
+ file_hash = self.get_file_hash(content)
216
 
217
+ # Check cache first
218
+ if file_hash in self.cache:
219
+ logger.info(f"πŸ“¦ Cache hit for {os.path.basename(file_path)}")
220
+ return self.cache[file_hash]
 
 
 
221
 
222
+ # Detect file type
223
+ file_ext = Path(file_path).suffix.lower()
224
+ if not file_ext:
225
+ file_ext = self._detect_file_type(content)
 
226
 
227
+ # Process based on file type
228
+ processor = self.processors.get(file_ext, self.process_text)
229
 
230
+ try:
231
+ chunks = await processor(file_path, content)
 
 
 
232
 
233
+ # Cache the result
234
+ self.cache[file_hash] = chunks
235
+
236
+ logger.info(f"βœ… Processed {os.path.basename(file_path)}: {len(chunks)} chunks")
237
+ return chunks
238
+
239
+ except Exception as e:
240
+ logger.error(f"❌ Processing failed for {file_path}: {e}")
241
+ return self._emergency_text_extraction(content, file_path)
242
+
243
+ def _detect_file_type(self, content: bytes) -> str:
244
+ """Detect file type from content"""
245
+ if content.startswith(b'%PDF'):
246
+ return '.pdf'
247
+ elif content.startswith(b'PK'):
248
+ return '.docx' if b'word/' in content[:1000] else '.zip'
249
+ elif content.startswith(b'<html') or content.startswith(b'<!DOCTYPE'):
250
+ return '.html'
251
+ elif content.startswith(b'<?xml'):
252
+ return '.xml'
253
+ else:
254
+ return '.txt'
255
+
256
+ # --- PDF PROCESSING (Enhanced) ---
257
+ async def process_pdf(self, file_path: str, content: bytes) -> List[Dict[str, Any]]:
258
+ """Enhanced PDF processing with tables and images"""
259
  chunks = []
 
 
 
 
 
 
 
 
 
 
 
 
 
260
 
261
+ with open(file_path, 'wb') as f:
262
+ f.write(content)
 
 
 
 
263
 
264
+ try:
265
+ # Extract text with PyMuPDF
266
+ doc = fitz.open(file_path)
267
+ full_text = ""
268
+
269
+ for page_num in range(min(len(doc), self.max_pages)):
270
+ page = doc[page_num]
271
+
272
+ # Extract text
273
+ text = page.get_text()
274
+
275
+ # Extract images as context (if they contain text)
276
+ image_list = page.get_images()
277
+ for img in image_list[:3]: # Limit images
278
+ try:
279
+ xref = img[0]
280
+ base_image = doc.extract_image(xref)
281
+ # Could add OCR here if needed
282
+ except:
283
+ pass
284
+
285
+ if text.strip():
286
+ full_text += f"\n\nPage {page_num + 1}:\n{self._clean_text(text)}"
287
+
288
+ doc.close()
289
+
290
+ # Extract tables with pdfplumber
291
+ table_text = await self._extract_pdf_tables(file_path)
292
+ if table_text:
293
+ full_text += f"\n\n=== TABLES ===\n{table_text}"
294
+
295
+ # Create semantic chunks
296
+ chunks = self._create_semantic_chunks(full_text, file_path, "pdf")
297
+
298
+ except Exception as e:
299
+ logger.error(f"PDF processing error: {e}")
300
+ chunks = self._emergency_text_extraction(content, file_path)
301
 
302
+ finally:
303
+ if os.path.exists(file_path):
304
+ os.remove(file_path)
305
 
306
  return chunks
307
+
308
+ async def _extract_pdf_tables(self, file_path: str) -> str:
309
+ """Extract tables from PDF"""
310
  table_text = ""
 
 
 
311
  try:
312
  with pdfplumber.open(file_path) as pdf:
313
+ for page_num, page in enumerate(pdf.pages[:15]):
 
 
 
 
 
 
 
314
  tables = page.find_tables()
315
+ for i, table in enumerate(tables[:3]):
 
 
 
 
316
  try:
317
  table_data = table.extract()
318
+ if table_data and len(table_data) > 1:
319
+ table_md = f"\n**Table {i+1} (Page {page_num+1})**\n"
320
+ for row in table_data[:20]:
321
+ if row:
322
+ clean_row = [str(cell or "").strip()[:50] for cell in row]
323
+ table_md += "| " + " | ".join(clean_row) + " |\n"
324
+ table_text += table_md + "\n"
325
+ except:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
326
  continue
 
 
 
327
  except Exception as e:
328
+ logger.warning(f"Table extraction failed: {e}")
329
+
330
  return table_text
331
+
332
+ # --- DOCX/DOC PROCESSING ---
333
+ async def process_docx(self, file_path: str, content: bytes) -> List[Dict[str, Any]]:
334
+ """Process DOCX files"""
335
+ with open(file_path, 'wb') as f:
336
+ f.write(content)
337
+
338
  try:
339
+ doc = docx.Document(file_path)
 
340
  full_text = ""
 
341
 
342
+ # Extract paragraphs
343
+ for para in doc.paragraphs:
344
+ if para.text.strip():
345
+ full_text += para.text + "\n"
346
 
347
+ # Extract tables
348
+ for table in doc.tables:
349
+ table_text = "\n**TABLE**\n"
350
+ for row in table.rows:
351
+ row_text = []
352
+ for cell in row.cells:
353
+ row_text.append(cell.text.strip())
354
+ table_text += "| " + " | ".join(row_text) + " |\n"
355
+ full_text += table_text + "\n"
356
+
357
+ chunks = self._create_semantic_chunks(full_text, file_path, "docx")
358
+
359
+ except Exception as e:
360
+ logger.error(f"DOCX processing error: {e}")
361
+ chunks = self._emergency_text_extraction(content, file_path)
362
+
363
+ finally:
364
+ if os.path.exists(file_path):
365
+ os.remove(file_path)
366
+
367
+ return chunks
368
+
369
+ async def process_doc(self, file_path: str, content: bytes) -> List[Dict[str, Any]]:
370
+ """Process DOC files (fallback to text extraction)"""
371
+ return self._emergency_text_extraction(content, file_path)
372
+
373
+ # --- EXCEL PROCESSING ---
374
+ async def process_excel(self, file_path: str, content: bytes) -> List[Dict[str, Any]]:
375
+ """Process Excel files"""
376
+ with open(file_path, 'wb') as f:
377
+ f.write(content)
378
+
379
+ try:
380
+ workbook = openpyxl.load_workbook(file_path, read_only=True)
381
+ full_text = ""
382
+
383
+ for sheet_name in workbook.sheetnames[:5]: # Max 5 sheets
384
+ sheet = workbook[sheet_name]
385
+ full_text += f"\n**Sheet: {sheet_name}**\n"
386
 
387
+ # Get data as table
388
+ data = []
389
+ for row in sheet.iter_rows(max_row=min(sheet.max_row, 100), values_only=True):
390
+ if any(cell for cell in row): # Skip empty rows
391
+ data.append([str(cell or "").strip() for cell in row])
 
 
 
 
 
 
392
 
393
+ if data:
394
+ # Format as table
395
+ for row in data:
396
+ full_text += "| " + " | ".join(row[:10]) + " |\n" # Max 10 columns
397
 
398
+ full_text += "\n"
399
+
400
+ workbook.close()
401
+ chunks = self._create_semantic_chunks(full_text, file_path, "excel")
402
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
403
  except Exception as e:
404
+ logger.error(f"Excel processing error: {e}")
405
+ chunks = self._emergency_text_extraction(content, file_path)
406
+
407
+ finally:
408
+ if os.path.exists(file_path):
409
+ os.remove(file_path)
410
+
411
+ return chunks
412
+
413
+ # --- CSV PROCESSING ---
414
+ async def process_csv(self, file_path: str, content: bytes) -> List[Dict[str, Any]]:
415
+ """Process CSV files"""
416
  try:
417
+ text_content = content.decode('utf-8', errors='ignore')
418
+ lines = text_content.split('\n')
419
+
420
+ full_text = "**CSV DATA**\n"
421
+ for i, line in enumerate(lines[:200]): # Max 200 rows
422
+ if line.strip():
423
+ # Parse CSV row
424
+ row_data = next(csv.reader([line]))
425
+ full_text += "| " + " | ".join(str(cell).strip()[:50] for cell in row_data) + " |\n"
426
+
427
+ chunks = self._create_semantic_chunks(full_text, file_path, "csv")
428
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
429
  except Exception as e:
430
+ logger.error(f"CSV processing error: {e}")
431
+ chunks = self._emergency_text_extraction(content, file_path)
432
+
433
+ return chunks
434
+
435
+ # --- EMAIL PROCESSING ---
436
+ async def process_email(self, file_path: str, content: bytes) -> List[Dict[str, Any]]:
437
+ """Process email files"""
 
 
 
 
 
 
 
 
 
438
  try:
439
+ # Parse email
440
+ msg = email.message_from_bytes(content, policy=default)
441
+
442
+ full_text = f"**EMAIL**\n"
443
+ full_text += f"From: {msg.get('From', 'Unknown')}\n"
444
+ full_text += f"To: {msg.get('To', 'Unknown')}\n"
445
+ full_text += f"Subject: {msg.get('Subject', 'No Subject')}\n"
446
+ full_text += f"Date: {msg.get('Date', 'Unknown')}\n\n"
447
+
448
+ # Extract body
449
+ if msg.is_multipart():
450
+ for part in msg.walk():
451
+ if part.get_content_type() == "text/plain":
452
+ body = part.get_content()
453
+ full_text += f"Content:\n{body}\n"
454
+ else:
455
+ body = msg.get_content()
456
+ full_text += f"Content:\n{body}\n"
457
+
458
+ chunks = self._create_semantic_chunks(full_text, file_path, "email")
459
+
460
  except Exception as e:
461
+ logger.error(f"Email processing error: {e}")
462
+ chunks = self._emergency_text_extraction(content, file_path)
 
 
 
 
 
 
 
 
463
 
464
+ return chunks
465
+
466
+ # --- HTML/XML PROCESSING ---
467
+ async def process_html(self, file_path: str, content: bytes) -> List[Dict[str, Any]]:
468
+ """Process HTML files"""
469
+ try:
470
+ soup = BeautifulSoup(content, 'html.parser')
471
+
472
+ # Remove script and style tags
473
+ for script in soup(["script", "style"]):
474
+ script.decompose()
475
+
476
+ # Extract text
477
+ text = soup.get_text()
478
+
479
+ chunks = self._create_semantic_chunks(text, file_path, "html")
480
+
481
+ except Exception as e:
482
+ logger.error(f"HTML processing error: {e}")
483
+ chunks = self._emergency_text_extraction(content, file_path)
484
+
485
+ return chunks
486
+
487
+ async def process_xml(self, file_path: str, content: bytes) -> List[Dict[str, Any]]:
488
+ """Process XML files"""
489
+ try:
490
+ root = ET.fromstring(content)
491
+
492
+ def extract_text(element, level=0):
493
+ text = ""
494
+ if element.text and element.text.strip():
495
+ text += f"{' ' * level}{element.tag}: {element.text.strip()}\n"
496
+ for child in element:
497
+ text += extract_text(child, level + 1)
498
+ return text
499
+
500
+ full_text = extract_text(root)
501
+ chunks = self._create_semantic_chunks(full_text, file_path, "xml")
502
+
503
+ except Exception as e:
504
+ logger.error(f"XML processing error: {e}")
505
+ chunks = self._emergency_text_extraction(content, file_path)
506
+
507
+ return chunks
508
+
509
+ # --- ARCHIVE PROCESSING ---
510
+ async def process_archive(self, file_path: str, content: bytes) -> List[Dict[str, Any]]:
511
+ """Process ZIP/RAR archives"""
512
+ with open(file_path, 'wb') as f:
513
+ f.write(content)
514
+
515
+ chunks = []
516
+ try:
517
+ if file_path.endswith('.zip'):
518
+ with zipfile.ZipFile(file_path, 'r') as zip_file:
519
+ for file_info in zip_file.filelist[:10]: # Max 10 files
520
+ try:
521
+ file_content = zip_file.read(file_info)
522
+ sub_chunks = await self.process_document(file_info.filename, file_content)
523
+ chunks.extend(sub_chunks)
524
+ except:
525
+ continue
526
+
527
+ # Could add RAR support here if needed
528
+
529
+ except Exception as e:
530
+ logger.error(f"Archive processing error: {e}")
531
+ chunks = self._emergency_text_extraction(content, file_path)
532
+
533
+ finally:
534
+ if os.path.exists(file_path):
535
+ os.remove(file_path)
536
+
537
+ return chunks
538
+
539
+ # --- JSON PROCESSING ---
540
+ async def process_json(self, file_path: str, content: bytes) -> List[Dict[str, Any]]:
541
+ """Process JSON files"""
542
+ try:
543
+ data = json.loads(content.decode('utf-8'))
544
+ full_text = json.dumps(data, indent=2, ensure_ascii=False)
545
+ chunks = self._create_semantic_chunks(full_text, file_path, "json")
546
+ except Exception as e:
547
+ logger.error(f"JSON processing error: {e}")
548
+ chunks = self._emergency_text_extraction(content, file_path)
549
+
550
+ return chunks
551
+
552
+ # --- TEXT PROCESSING ---
553
+ async def process_text(self, file_path: str, content: bytes) -> List[Dict[str, Any]]:
554
+ """Process plain text files"""
555
+ try:
556
+ text = content.decode('utf-8', errors='ignore')
557
+ chunks = self._create_semantic_chunks(text, file_path, "text")
558
+ except Exception as e:
559
+ logger.error(f"Text processing error: {e}")
560
+ chunks = []
561
+
562
+ return chunks
563
+
564
+ # --- UTILITY METHODS ---
565
+ def _clean_text(self, text: str) -> str:
566
+ """Clean extracted text"""
567
+ # Remove excessive whitespace
568
+ text = re.sub(r'\n\s*\n\s*\n+', '\n\n', text)
569
+ text = re.sub(r'\s+', ' ', text)
570
+
571
+ # Remove noise
572
+ noise_patterns = [
573
+ r'Office of the Insurance Ombudsman.*?\n',
574
+ r'Lalit Bhawan.*?\n',
575
+ r'^\d+\s*$'
576
  ]
577
 
578
+ for pattern in noise_patterns:
579
+ text = re.sub(pattern, '', text, flags=re.MULTILINE)
 
 
 
 
 
580
 
581
+ return text.strip()
582
+
583
+ def _create_semantic_chunks(self, text: str, source: str, doc_type: str) -> List[Dict[str, Any]]:
584
+ """Create semantic chunks from text"""
585
+ text = self._clean_text(text)
586
 
587
+ if not text or len(text) < 50:
588
+ return []
 
 
 
 
589
 
590
+ # Semantic boundary detection
591
+ boundaries = [0]
592
 
593
+ # Look for section markers
594
+ section_patterns = [
595
+ r'\n\s*(?:\d+\.)+\s*[A-Z]',
596
+ r'\n\s*[A-Z][A-Z\s]{8,}:',
597
+ r'\n\s*(?:TABLE|SECTION|PART)',
598
+ r'\n\s*\*\*[^*]+\*\*'
599
+ ]
600
 
601
+ for pattern in section_patterns:
602
+ for match in re.finditer(pattern, text):
603
+ boundaries.append(match.start())
604
 
605
+ boundaries.append(len(text))
606
+ boundaries = sorted(set(boundaries))
607
+
608
+ chunks = []
609
+ for i in range(len(boundaries) - 1):
610
+ start = boundaries[i]
611
+ end = boundaries[i + 1]
612
+ chunk_text = text[start:end].strip()
613
+
614
+ if len(chunk_text) > self.chunk_size:
615
+ # Split large chunks
616
+ sub_chunks = self._split_large_chunk(chunk_text)
617
+ for j, sub_chunk in enumerate(sub_chunks):
618
+ chunks.append({
619
+ "content": sub_chunk,
620
+ "metadata": {
621
+ "source": os.path.basename(source),
622
+ "chunk_index": len(chunks),
623
+ "document_type": doc_type,
624
+ "chunk_length": len(sub_chunk),
625
+ "is_sub_chunk": True,
626
+ "parent_chunk": i
627
+ },
628
+ "chunk_id": str(uuid.uuid4())
629
+ })
630
+ elif len(chunk_text) > 100:
631
+ chunks.append({
632
+ "content": chunk_text,
633
+ "metadata": {
634
+ "source": os.path.basename(source),
635
+ "chunk_index": len(chunks),
636
+ "document_type": doc_type,
637
+ "chunk_length": len(chunk_text),
638
+ "is_sub_chunk": False
639
+ },
640
+ "chunk_id": str(uuid.uuid4())
641
+ })
642
+
643
+ return chunks[:self.max_chunks]
644
+
645
+ def _split_large_chunk(self, text: str) -> List[str]:
646
+ """Split large chunks intelligently"""
647
+ chunks = []
648
+ sentences = re.split(r'(?<=[.!?])\s+', text)
649
+
650
+ current_chunk = ""
651
+ for sentence in sentences:
652
+ if len(current_chunk) + len(sentence) <= self.chunk_size:
653
+ current_chunk += sentence + " "
654
+ else:
655
+ if current_chunk.strip():
656
+ chunks.append(current_chunk.strip())
657
+ current_chunk = sentence + " "
658
+
659
+ if current_chunk.strip():
660
+ chunks.append(current_chunk.strip())
661
+
662
+ return chunks
663
+
664
+ def _emergency_text_extraction(self, content: bytes, file_path: str) -> List[Dict[str, Any]]:
665
+ """Emergency text extraction for unsupported formats"""
666
+ try:
667
+ text = content.decode('utf-8', errors='ignore')
668
+ if len(text) > 50:
669
+ chunks = self._create_semantic_chunks(text, file_path, "unknown")
670
+ return chunks
671
+ except:
672
+ pass
673
+
674
+ return [{
675
+ "content": "Failed to extract content from document",
676
+ "metadata": {
677
+ "source": os.path.basename(file_path),
678
+ "chunk_index": 0,
679
+ "document_type": "error",
680
+ "error": True
681
+ },
682
+ "chunk_id": str(uuid.uuid4())
683
+ }]
684
+
685
+ # --- ENHANCED RAG PIPELINE ---
686
+
687
+ class UltimateRAGPipeline:
688
+ def __init__(self, collection_name: str, llm_manager: MultiLLMManager):
689
+ self.collection_name = collection_name
690
+ self.llm_manager = llm_manager
691
+ self.security_guard = SecurityGuard()
692
+
693
+ # Initialize embedding model (cached)
694
+ self.embedding_model = HuggingFaceEmbeddings(
695
+ model_name="BAAI/bge-small-en-v1.5",
696
+ model_kwargs={'device': 'cpu'},
697
+ encode_kwargs={'normalize_embeddings': True}
698
+ )
699
+
700
+ self.vectorstore = Chroma(
701
+ collection_name=collection_name,
702
+ embedding_function=self.embedding_model,
703
+ persist_directory="/tmp/chroma_ultimate"
704
+ )
705
+
706
+ logger.info(f"πŸš€ Ultimate RAG Pipeline initialized: {collection_name}")
707
+
708
+ async def add_documents(self, chunks: List[Dict[str, Any]]):
709
+ """Add documents with advanced filtering"""
710
  if not chunks:
 
711
  return
 
 
712
 
713
+ logger.info(f"πŸ“š Processing {len(chunks)} chunks...")
714
+
715
+ # Advanced quality filtering
716
  quality_chunks = []
717
  for chunk in chunks:
718
  content = chunk['content']
 
719
 
720
+ # Skip error chunks
721
+ if chunk['metadata'].get('error'):
722
+ continue
723
 
724
+ # Quality assessment
725
+ quality_score = 0
 
 
 
 
 
 
 
 
726
 
727
+ # Length factor
728
+ if 100 <= len(content) <= 2000:
729
+ quality_score += 2
730
+ elif len(content) > 50:
731
+ quality_score += 1
732
 
733
+ # Content richness
734
+ sentences = len(re.split(r'[.!?]+', content))
735
+ if sentences > 3:
736
+ quality_score += 1
737
 
738
+ # Numerical data (good for policies)
739
+ numbers = len(re.findall(r'\d+', content))
740
+ if numbers > 0:
741
+ quality_score += 1
742
 
743
+ if quality_score >= 2:
744
  quality_chunks.append(chunk)
 
 
 
745
 
746
+ logger.info(f"πŸ“š Filtered to {len(quality_chunks)} quality chunks")
747
+
748
+ # Convert to LangChain documents
749
+ documents = [
750
+ LangChainDocument(
751
+ page_content=chunk['content'],
752
+ metadata=chunk['metadata']
753
+ )
754
  for chunk in quality_chunks
755
  ]
756
+
757
+ # Add to vector store
758
+ if documents:
759
+ self.vectorstore.add_documents(documents)
760
+ logger.info(f"βœ… Added {len(documents)} documents to vector store")
761
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
762
  async def answer_question(self, question: str) -> str:
763
+ """Answer question with security and quality checks"""
764
+ # Security check
765
+ if self.security_guard.detect_jailbreak(question):
766
+ return self.security_guard.sanitize_response(question, "")
767
+
768
  try:
769
+ # Enhanced retrieval
770
+ retriever = self.vectorstore.as_retriever(
771
+ search_type="mmr",
772
+ search_kwargs={
773
+ "k": 15, # More documents
774
+ "fetch_k": 30,
775
+ "lambda_mult": 0.5
776
+ }
777
+ )
778
 
779
+ relevant_docs = retriever.get_relevant_documents(question)
780
+
781
+ if not relevant_docs:
782
+ return "I don't have enough information in the provided documents to answer this question."
783
+
784
+ # Prepare context
785
+ context = "\n\n".join([doc.page_content for doc in relevant_docs])
786
+
787
+ # Create enhanced prompt
788
+ prompt = self._create_enhanced_prompt(context, question)
789
+
790
+ # Get response from multi-LLM system
791
+ response = await self.llm_manager.get_response(prompt)
792
+
793
+ # Final security check
794
+ response = self.security_guard.sanitize_response(question, response)
795
+
796
+ # Clean formatting
797
+ response = self._clean_response(response)
798
+
799
+ return response
800
 
 
 
 
801
  except Exception as e:
802
+ logger.error(f"❌ Question processing failed: {e}")
803
+ return "An error occurred while processing your question."
804
+
805
+ def _create_enhanced_prompt(self, context: str, question: str) -> str:
806
+ """Create enhanced prompt for better responses"""
807
+ return f"""You are an expert document analyst. Analyze the provided document context to answer the question accurately and professionally.
 
 
 
 
 
808
 
809
+ DOCUMENT CONTEXT:
810
+ {context}
 
 
 
 
 
811
 
812
+ QUESTION: {question}
813
 
814
+ INSTRUCTIONS:
815
+ - Provide accurate answers based ONLY on the document context
816
+ - Include specific details: numbers, percentages, dates, amounts, conditions
817
+ - Write in clear, professional language without excessive quotes
818
+ - If multiple conditions apply, list them clearly
819
+ - Be precise about limitations, exceptions, and requirements
820
+ - If information is incomplete, state what is available
821
+ - Do not make assumptions beyond what is stated in the documents
822
 
823
+ ANSWER:"""
824
+
825
+ def _clean_response(self, response: str) -> str:
826
+ """Clean response formatting"""
827
+ # Remove excessive quotes
828
+ response = re.sub(r'"([^"]{1,50})"', r'\1', response)
829
+ response = re.sub(r'"(\w+)"', r'\1', response)
 
 
 
 
 
 
 
 
830
 
831
+ # Fix spacing
832
+ response = re.sub(r'\s+', ' ', response)
833
+ response = response.replace(' ,', ',')
834
+ response = response.replace(' .', '.')
835
 
836
+ # Clean newlines
837
+ response = re.sub(r'\n\s*\n\s*\n+', '\n\n', response)
838
 
839
+ return response.strip()
840
+
841
+ # --- AUTHENTICATION ---
842
+
843
+ async def verify_bearer_token(authorization: str = Header(None)):
844
+ """Enhanced authentication with better logging"""
845
+ if not authorization:
846
+ raise HTTPException(status_code=401, detail="Authorization header required")
847
+
848
+ if not authorization.startswith("Bearer "):
849
+ raise HTTPException(status_code=401, detail="Invalid authorization format")
850
+
851
+ token = authorization.replace("Bearer ", "")
852
+
853
+ if len(token) < 10:
854
+ raise HTTPException(status_code=401, detail="Invalid token format")
855
+
856
+ logger.info(f"βœ… Authentication successful with token: {token[:10]}...")
857
+ return token
858
+
859
+ # --- GLOBAL INSTANCES ---
860
+
861
+ # Initialize global services
862
+ multi_llm = MultiLLMManager()
863
+ doc_processor = UniversalDocumentProcessor()
864
 
865
+ # --- API MODELS ---
866
 
867
  class SubmissionRequest(BaseModel):
868
  documents: List[str]
869
  questions: List[str]
870
 
 
 
 
 
 
 
 
 
871
  class SubmissionResponse(BaseModel):
872
+ answers: List[str]
 
 
 
 
 
 
 
 
 
 
873
 
874
+ # --- MAIN ENDPOINT ---
875
 
876
  @app.post("/hackrx/run", response_model=SubmissionResponse, dependencies=[Depends(verify_bearer_token)])
877
  async def run_submission(request: Request, submission_request: SubmissionRequest = Body(...)):
878
+ start_time = time.time()
879
+ logger.info(f"🎯 ULTIMATE PROCESSING: {len(submission_request.documents)} docs, {len(submission_request.questions)} questions")
880
+
881
  try:
882
+ # Create unique session
883
+ session_id = f"ultimate_{uuid.uuid4().hex[:8]}"
884
+ rag_pipeline = UltimateRAGPipeline(session_id, multi_llm)
 
 
885
 
886
+ # Process all documents concurrently
887
  all_chunks = []
888
+
889
+ async with httpx.AsyncClient(timeout=60.0) as client:
890
+ # Create semaphore to limit concurrent downloads
891
+ semaphore = asyncio.Semaphore(3)
892
+
893
+ async def process_single_document(doc_idx: int, doc_url: str):
894
+ async with semaphore:
895
+ try:
896
+ logger.info(f"πŸ“₯ Downloading document {doc_idx + 1}")
897
+ response = await client.get(doc_url, follow_redirects=True)
898
+ response.raise_for_status()
899
+
900
+ # Get filename from URL or generate one
901
+ filename = os.path.basename(doc_url.split('?')[0]) or f"document_{doc_idx}"
902
+
903
+ # Process document
904
+ chunks = await doc_processor.process_document(filename, response.content)
905
+
906
+ logger.info(f"βœ… Document {doc_idx + 1}: {len(chunks)} chunks")
907
+ return chunks
908
+
909
+ except Exception as e:
910
+ logger.error(f"❌ Document {doc_idx + 1} failed: {e}")
911
+ return []
912
+
913
+ # Process all documents concurrently
914
+ tasks = [
915
+ process_single_document(i, url)
916
+ for i, url in enumerate(submission_request.documents)
917
+ ]
918
+
919
+ results = await asyncio.gather(*tasks)
920
+
921
+ # Flatten results
922
+ for chunks in results:
923
+ all_chunks.extend(chunks)
924
+
925
+ logger.info(f"πŸ“Š Total chunks processed: {len(all_chunks)}")
926
+
927
  if not all_chunks:
928
+ logger.error("❌ No valid content extracted!")
 
929
  return SubmissionResponse(answers=[
930
+ "No valid content could be extracted from the provided documents."
931
+ for _ in submission_request.questions
932
  ])
 
 
 
 
 
 
 
933
 
934
+ # Add to RAG pipeline
935
+ await rag_pipeline.add_documents(all_chunks)
936
+
937
+ # Answer all questions concurrently
938
+ logger.info(f"❓ Answering questions...")
939
+
940
+ # Limit concurrent questions to avoid overwhelming the LLM
941
+ semaphore = asyncio.Semaphore(2)
942
 
943
+ async def answer_single_question(question: str) -> str:
944
+ async with semaphore:
945
+ return await rag_pipeline.answer_question(question)
946
+
947
+ tasks = [answer_single_question(q) for q in submission_request.questions]
948
+ answers = await asyncio.gather(*tasks)
949
+
950
+ elapsed = time.time() - start_time
951
+ logger.info(f"πŸŽ‰ ULTIMATE SUCCESS! Processed in {elapsed:.2f}s")
952
+
953
+ return SubmissionResponse(answers=answers)
954
 
955
  except Exception as e:
956
+ elapsed = time.time() - start_time
957
+ logger.error(f"πŸ’₯ CRITICAL ERROR after {elapsed:.2f}s: {e}")
958
+
959
  return SubmissionResponse(answers=[
960
+ f"Processing error occurred. Please try again."
961
+ for _ in submission_request.questions
962
  ])
963
 
964
+ # --- HEALTH ENDPOINTS ---
965
+
966
  @app.get("/")
967
  def read_root():
968
+ return {
969
+ "message": "πŸ† ULTIMATE HACKATHON RAG SYSTEM",
970
+ "version": "3.0.0",
971
+ "status": "READY TO WIN!",
972
+ "supported_formats": list(doc_processor.processors.keys()),
973
+ "features": [
974
+ "Multi-format document processing",
975
+ "Multi-LLM fallback system",
976
+ "Anti-jailbreak security",
977
+ "Smart caching",
978
+ "Concurrent processing",
979
+ "Semantic chunking"
980
+ ]
981
+ }
982
 
983
  @app.get("/health")
984
  def health_check():
985
+ return {
986
+ "status": "healthy",
987
+ "version": "3.0.0",
988
+ "cache_size": len(doc_processor.cache),
989
+ "timestamp": time.time()
990
+ }
991
+
992
+ # --- TESTING ENDPOINT ---
993
+
994
+ @app.post("/test")
995
+ async def test_endpoint(request: dict):
996
+ """Test endpoint for validation"""
997
+ return {
998
+ "status": "success",
999
+ "message": "Ultimate RAG system is operational",
1000
+ "processed_request": request
1001
+ }
requirements.txt CHANGED
@@ -46,4 +46,9 @@ python-magic==0.4.27
46
  # Core dependencies that might be missing
47
  typing-extensions==4.8.0
48
  requests==2.31.0
49
- certifi==2023.11.17
 
 
 
 
 
 
46
  # Core dependencies that might be missing
47
  typing-extensions==4.8.0
48
  requests==2.31.0
49
+ certifi==2023.11.17
50
+
51
+ openai
52
+ python-docx
53
+ google-generativeai
54
+ openpyxl
+ cachetools
+ rarfile
+ eml-parser
+ beautifulsoup4