rohannsinghal committed on
Commit
de39ee0
·
1 Parent(s): c82e944
Files changed (4) hide show
  1. app/main_api.py +11 -1130
  2. app/main_api_backup.py +1136 -0
  3. requirements.txt +3 -55
  4. requirements_backup.txt +55 -0
app/main_api.py CHANGED
@@ -1,1136 +1,17 @@
1
- # --- KAGGLE-POWERED RAG SYSTEM - COMPLETE 1144+ LINES WITH DEADLOCK FIX ---
 
2
 
3
- import os
4
- import json
5
- import uuid
6
- import time
7
- import re
8
- import asyncio
9
- import logging
10
- import hashlib
11
- import httpx
12
- from typing import List, Dict, Any, Optional
13
- from collections import defaultdict
14
- from itertools import cycle
15
- from pathlib import Path
16
- import functools
17
- import threading
18
- import concurrent.futures
19
 
20
- # FastAPI and core dependencies
21
- from fastapi import FastAPI, Body, HTTPException, Request, Depends, Header
22
- from fastapi.middleware.cors import CORSMiddleware
23
- from pydantic import BaseModel
24
-
25
- # LangChain imports
26
- from langchain_community.vectorstores import Chroma
27
-
28
- # Multi-format document processing
29
- import fitz # PyMuPDF
30
- import pdfplumber
31
- import docx
32
- import openpyxl
33
- import csv
34
- import zipfile
35
- import email
36
- from email.policy import default
37
- from bs4 import BeautifulSoup
38
- import xml.etree.ElementTree as ET
39
-
40
- # LLM providers
41
- import groq
42
- import openai
43
- import google.generativeai as genai
44
-
45
- import cachetools
46
- from dotenv import load_dotenv
47
-
48
- # Setup
49
- load_dotenv()
50
- logging.basicConfig(level=logging.INFO)
51
- logger = logging.getLogger(__name__)
52
-
53
- app = FastAPI(title="Kaggle-Powered Hackathon RAG", version="5.4.0")
54
-
55
- app.add_middleware(
56
- CORSMiddleware,
57
- allow_origins=["*"],
58
- allow_credentials=True,
59
- allow_methods=["*"],
60
- allow_headers=["*", "ngrok-skip-browser-warning"],
61
- )
62
-
63
- # --- CRITICAL FIX: LAZY KAGGLE MODEL CLIENT ---
64
- class LazyKaggleModelClient:
65
- """LAZY INITIALIZATION: Only connects when actually needed - PREVENTS 'Preparing Space' ISSUE"""
66
- def __init__(self):
67
- self._client = None
68
- self._endpoint = None
69
- self._initialized = False
70
- logger.info("🎯 Lazy Kaggle Model Client created (no immediate connection)")
71
-
72
- def _initialize_if_needed(self):
73
- """Initialize client only when first API call is made"""
74
- if not self._initialized:
75
- # Get endpoint from Hugging Face Secrets (or fallback to env var)
76
- self._endpoint = os.getenv("KAGGLE_NGROK_URL") or os.getenv("KAGGLE_ENDPOINT", "")
77
-
78
- if not self._endpoint:
79
- logger.error("❌ No KAGGLE_NGROK_URL found in secrets or environment!")
80
- raise Exception("Kaggle endpoint not configured")
81
-
82
- self._endpoint = self._endpoint.rstrip('/')
83
- self._client = httpx.AsyncClient(
84
- timeout=30.0,
85
- headers={"ngrok-skip-browser-warning": "true"}
86
- )
87
- self._initialized = True
88
- logger.info(f"🎯 Lazy Kaggle client initialized: {self._endpoint}")
89
-
90
- async def health_check(self) -> bool:
91
- """Check if Kaggle model server is healthy"""
92
- try:
93
- self._initialize_if_needed()
94
- response = await self._client.get(f"{self._endpoint}/health")
95
- return response.status_code == 200
96
- except Exception as e:
97
- logger.error(f"Kaggle health check failed: {e}")
98
- return False
99
-
100
- async def generate_embeddings(self, texts: List[str]) -> List[List[float]]:
101
- """Generate embeddings using Kaggle GPU"""
102
- try:
103
- self._initialize_if_needed()
104
- response = await self._client.post(
105
- f"{self._endpoint}/embed",
106
- json={"texts": texts}
107
- )
108
- response.raise_for_status()
109
- result = response.json()
110
- logger.info(f"🎯 Kaggle embeddings: {result.get('count', 0)} texts in {result.get('processing_time', 0):.2f}s")
111
- return result["embeddings"]
112
- except Exception as e:
113
- logger.error(f"Kaggle embedding error: {e}")
114
- return []
115
-
116
- async def rerank_documents(self, query: str, documents: List[str], k: int = 8) -> List[str]:
117
- """Rerank documents using Kaggle GPU"""
118
- try:
119
- self._initialize_if_needed()
120
- response = await self._client.post(
121
- f"{self._endpoint}/rerank",
122
- json={
123
- "query": query,
124
- "documents": documents,
125
- "k": k
126
- }
127
- )
128
- response.raise_for_status()
129
- result = response.json()
130
- logger.info(f"🎯 Kaggle reranking: {k} docs in {result.get('processing_time', 0):.2f}s")
131
- return result["reranked_documents"]
132
- except Exception as e:
133
- logger.error(f"Kaggle reranking error: {e}")
134
- return documents[:k]
135
-
136
- # --- LIGHTWEIGHT QUERY PROCESSOR (YOUR COMPLETE ORIGINAL) ---
137
- class LightweightQueryProcessor:
138
- def __init__(self, kaggle_client: LazyKaggleModelClient):
139
- self.kaggle_client = kaggle_client
140
- self.cache = cachetools.TTLCache(maxsize=500, ttl=3600)
141
-
142
- async def enhance_query_semantically(self, question: str, domain: str = "insurance") -> str:
143
- """OPTIMIZED semantic query processing"""
144
-
145
- # Quick cache check with shorter hash
146
- cache_key = hashlib.md5(question.encode()).hexdigest()[:8]
147
- if cache_key in self.cache:
148
- return self.cache[cache_key]
149
-
150
- # Streamlined domain expansion
151
- enhanced_query = self._expand_with_domain_knowledge_fast(question, domain)
152
- enhanced_query = self._handle_incomplete_questions(enhanced_query)
153
-
154
- # Cache result
155
- self.cache[cache_key] = enhanced_query
156
- return enhanced_query
157
-
158
- def _expand_with_domain_knowledge_fast(self, query: str, domain: str) -> str:
159
- """OPTIMIZED domain expansion - same intelligence, faster processing"""
160
-
161
- # Streamlined expansion mapping for speed
162
- key_expansions = {
163
- 'grace period': 'payment deadline premium due',
164
- 'waiting period': 'exclusion time coverage delay',
165
- 'pre-existing': 'prior medical condition',
166
- 'coverage': 'policy benefits protection',
167
- 'exclusion': 'limitations restrictions',
168
- 'premium': 'insurance cost payment',
169
- 'claim': 'benefit request reimbursement',
170
- 'ayush': 'alternative medicine treatment',
171
- 'hospital': 'healthcare facility medical center'
172
- }
173
-
174
- query_lower = query.lower()
175
- for key_term, expansion in key_expansions.items():
176
- if key_term in query_lower:
177
- return f"{query}. Also: {expansion}"
178
-
179
- return query
180
-
181
- def _handle_incomplete_questions(self, query: str) -> str:
182
- """Handle R4's 'half questions' requirement"""
183
- incomplete_patterns = [
184
- r'^(what|how|when|where|why)\s*\?*$',
185
- r'^(yes|no)\s*\?*$',
186
- r'^\w{1,3}\s*\?*$',
187
- r'^(this|that|it)\s*',
188
- ]
189
-
190
- query_lower = query.lower()
191
- is_incomplete = any(re.search(pattern, query_lower) for pattern in incomplete_patterns)
192
-
193
- if is_incomplete and len(query.split()) <= 2:
194
- return f"{query}. Please provide information about insurance policy terms, coverage, exclusions, waiting periods, or benefits."
195
-
196
- return query
197
-
198
- # --- ANTI-JAILBREAK SECURITY SYSTEM (YOUR COMPLETE ORIGINAL) ---
199
- class SecurityGuard:
200
- def __init__(self):
201
- self.jailbreak_patterns = [
202
- r'ignore.*previous.*instructions',
203
- r'act.*as.*different.*character',
204
- r'generate.*code.*(?:javascript|python|html)',
205
- r'write.*program',
206
- r'roleplay.*as',
207
- r'pretend.*you.*are',
208
- r'system.*prompt',
209
- r'override.*settings',
210
- r'bypass.*restrictions',
211
- r'admin.*mode',
212
- r'developer.*mode',
213
- r'tell.*me.*about.*yourself',
214
- r'what.*are.*you',
215
- r'who.*created.*you'
216
- ]
217
-
218
- def detect_jailbreak(self, text: str) -> bool:
219
- """Detect jailbreak attempts"""
220
- text_lower = text.lower()
221
- return any(re.search(pattern, text_lower) for pattern in self.jailbreak_patterns)
222
-
223
- def sanitize_response(self, question: str, answer: str) -> str:
224
- """Sanitize responses against jailbreaks"""
225
- if self.detect_jailbreak(question):
226
- return "I can only provide information based on the document content provided. Please ask questions about the document."
227
-
228
- # Remove any potential code or script tags
229
- answer = re.sub(r'<script.*?</script>', '', answer, flags=re.DOTALL | re.IGNORECASE)
230
- answer = re.sub(r'<.*?>', '', answer) # Remove HTML tags
231
-
232
- return answer
233
-
234
- # --- MULTI-LLM MANAGER (YOUR COMPLETE ORIGINAL WITH ALL PROVIDERS) ---
235
- class MultiLLMManager:
236
- def __init__(self):
237
- # Initialize multiple LLM providers with fallback
238
- self.providers = ['groq'] # Start with Groq as primary
239
-
240
- self.groq_keys = cycle([k.strip() for k in os.getenv("GROQ_API_KEYS", "").split(',') if k.strip()])
241
-
242
- # Optional paid providers (if keys available)
243
- openai_keys = [k.strip() for k in os.getenv("OPENAI_API_KEYS", "").split(',') if k.strip()]
244
- gemini_keys = [k.strip() for k in os.getenv("GEMINI_API_KEYS", "").split(',') if k.strip()]
245
-
246
- if openai_keys:
247
- self.providers.append('openai')
248
- self.openai_keys = cycle(openai_keys)
249
-
250
- if gemini_keys:
251
- self.providers.append('gemini')
252
- self.gemini_keys = cycle(gemini_keys)
253
-
254
- self.current_provider_index = 0
255
- logger.info(f"🔑 Multi-LLM Manager initialized with {len(self.providers)} providers")
256
-
257
- async def get_response(self, prompt: str, max_tokens: int = 900) -> str:
258
- """Get response with automatic fallback between providers"""
259
- for attempt in range(len(self.providers)):
260
- try:
261
- provider = self.providers[self.current_provider_index]
262
-
263
- if provider == 'groq':
264
- return await self._groq_response(prompt, max_tokens)
265
- elif provider == 'openai':
266
- return await self._openai_response(prompt, max_tokens)
267
- elif provider == 'gemini':
268
- return await self._gemini_response(prompt, max_tokens)
269
-
270
- except Exception as e:
271
- logger.warning(f"{provider} failed: {e}")
272
- self.current_provider_index = (self.current_provider_index + 1) % len(self.providers)
273
- continue
274
-
275
- return "Error: All LLM providers failed"
276
-
277
- async def _groq_response(self, prompt: str, max_tokens: int) -> str:
278
- key = next(self.groq_keys)
279
- client = groq.Groq(api_key=key)
280
-
281
- response = client.chat.completions.create(
282
- model="llama-3.3-70b-versatile",
283
- messages=[{"role": "user", "content": prompt}],
284
- temperature=0.1,
285
- max_tokens=max_tokens,
286
- top_p=0.9
287
- )
288
- return response.choices[0].message.content.strip()
289
-
290
- async def _openai_response(self, prompt: str, max_tokens: int) -> str:
291
- key = next(self.openai_keys)
292
- openai.api_key = key
293
-
294
- response = await openai.ChatCompletion.acreate(
295
- model="gpt-4o-mini",
296
- messages=[{"role": "user", "content": prompt}],
297
- temperature=0.1,
298
- max_tokens=max_tokens
299
- )
300
- return response.choices[0].message.content.strip()
301
-
302
- async def _gemini_response(self, prompt: str, max_tokens: int) -> str:
303
- key = next(self.gemini_keys)
304
- genai.configure(api_key=key)
305
-
306
- model = genai.GenerativeModel('gemini-pro')
307
- response = await model.generate_content_async(prompt)
308
- return response.text.strip()
309
-
310
- # --- COMPLETE UNIVERSAL DOCUMENT PROCESSOR (ALL YOUR ORIGINAL FEATURES) ---
311
- class UniversalDocumentProcessor:
312
- def __init__(self):
313
- # SPEED OPTIMIZATIONS: Reduced limits
314
- self.chunk_size = 1000 # Reduced from 1200
315
- self.chunk_overlap = 200
316
- self.max_chunks = 200 # Kept at 200 (good balance)
317
- self.max_pages = 18 # Reduced from 25
318
-
319
- # Smaller cache for speed
320
- self.cache = cachetools.TTLCache(maxsize=50, ttl=1800)
321
-
322
- # Supported formats (KEEPING all your excellent processors)
323
- self.processors = {
324
- '.pdf': self.process_pdf,
325
- '.docx': self.process_docx,
326
- '.doc': self.process_doc,
327
- '.xlsx': self.process_excel,
328
- '.xls': self.process_excel,
329
- '.csv': self.process_csv,
330
- '.txt': self.process_text,
331
- '.html': self.process_html,
332
- '.xml': self.process_xml,
333
- '.eml': self.process_email,
334
- '.zip': self.process_archive,
335
- '.json': self.process_json
336
- }
337
-
338
- logger.info("⚡ Speed-Optimized Universal Document Processor initialized")
339
-
340
- def get_file_hash(self, content: bytes) -> str:
341
- """Generate shorter hash for caching"""
342
- return hashlib.md5(content).hexdigest()[:8]
343
-
344
- async def process_document(self, file_path: str, content: bytes) -> List[Dict[str, Any]]:
345
- """Process any document format with optimized caching"""
346
- file_hash = self.get_file_hash(content)
347
-
348
- # Check cache first
349
- if file_hash in self.cache:
350
- logger.info(f"📦 Cache hit for {os.path.basename(file_path)}")
351
- return self.cache[file_hash]
352
-
353
- # Detect file type
354
- file_ext = Path(file_path).suffix.lower()
355
- if not file_ext:
356
- file_ext = self._detect_file_type(content)
357
-
358
- # Process based on file type
359
- processor = self.processors.get(file_ext, self.process_text)
360
-
361
- try:
362
- chunks = await processor(file_path, content)
363
-
364
- # Cache the result
365
- self.cache[file_hash] = chunks
366
-
367
- logger.info(f"✅ Processed {os.path.basename(file_path)}: {len(chunks)} chunks")
368
- return chunks
369
-
370
- except Exception as e:
371
- logger.error(f"❌ Processing failed for {file_path}: {e}")
372
- return self._emergency_text_extraction(content, file_path)
373
-
374
- def _detect_file_type(self, content: bytes) -> str:
375
- """Detect file type from content"""
376
- if content.startswith(b'%PDF'):
377
- return '.pdf'
378
- elif content.startswith(b'PK'):
379
- return '.docx' if b'word/' in content[:1000] else '.zip'
380
- elif content.startswith(b'<html') or content.startswith(b'<!DOCTYPE'):
381
- return '.html'
382
- elif content.startswith(b'<?xml'):
383
- return '.xml'
384
- else:
385
- return '.txt'
386
-
387
- # --- SPEED-OPTIMIZED PDF PROCESSING (YOUR COMPLETE ORIGINAL) ---
388
- async def process_pdf(self, file_path: str, content: bytes) -> List[Dict[str, Any]]:
389
- """Enhanced PDF processing with speed optimizations"""
390
- chunks = []
391
- temp_path = f"/tmp/{uuid.uuid4().hex[:6]}.pdf" # Shorter UUID
392
-
393
- with open(temp_path, 'wb') as f:
394
- f.write(content)
395
-
396
- try:
397
- # Extract text with PyMuPDF
398
- doc = fitz.open(temp_path)
399
- full_text = ""
400
-
401
- # SPEED OPTIMIZATION: Process fewer pages
402
- for page_num in range(min(len(doc), self.max_pages)):
403
- page = doc[page_num]
404
- text = page.get_text()
405
-
406
- if text.strip():
407
- full_text += f"\n\nPage {page_num + 1}:\n{self._clean_text(text)}"
408
-
409
- doc.close()
410
-
411
- # OPTIMIZED table extraction
412
- table_text = await self._extract_pdf_tables_fast(temp_path)
413
- if table_text:
414
- full_text += f"\n\n=== TABLES ===\n{table_text}"
415
-
416
- # Create semantic chunks
417
- chunks = self._create_semantic_chunks(full_text, file_path, "pdf")
418
-
419
- except Exception as e:
420
- logger.error(f"PDF processing error: {e}")
421
- chunks = self._emergency_text_extraction(content, file_path)
422
-
423
- finally:
424
- if os.path.exists(temp_path):
425
- os.remove(temp_path)
426
-
427
- return chunks
428
-
429
- async def _extract_pdf_tables_fast(self, file_path: str) -> str:
430
- """SPEED-OPTIMIZED table extraction"""
431
- table_text = ""
432
- try:
433
- with pdfplumber.open(file_path) as pdf:
434
- # SPEED OPTIMIZATION: Fewer pages and tables
435
- for page_num, page in enumerate(pdf.pages[:10]): # Reduced from 12
436
- tables = page.find_tables()
437
- for i, table in enumerate(tables[:1]): # Only 1 table per page
438
- try:
439
- table_data = table.extract()
440
- if table_data and len(table_data) > 1:
441
- table_md = f"\n**Table {i+1} (Page {page_num+1})**\n"
442
- for row in table_data[:12]: # Reduced from 15
443
- if row:
444
- clean_row = [str(cell or "").strip()[:30] for cell in row]
445
- table_md += "| " + " | ".join(clean_row) + " |\n"
446
- table_text += table_md + "\n"
447
- except:
448
- continue
449
- except Exception as e:
450
- logger.warning(f"Table extraction failed: {e}")
451
-
452
- return table_text
453
-
454
- # --- OTHER FORMAT PROCESSORS (ALL YOUR EXCELLENT FEATURES) ---
455
- async def process_docx(self, file_path: str, content: bytes) -> List[Dict[str, Any]]:
456
- """Process DOCX files"""
457
- temp_path = f"/tmp/{uuid.uuid4().hex[:6]}.docx"
458
- with open(temp_path, 'wb') as f:
459
- f.write(content)
460
-
461
- try:
462
- doc = docx.Document(temp_path)
463
- full_text = ""
464
-
465
- # Extract paragraphs
466
- for para in doc.paragraphs:
467
- if para.text.strip():
468
- full_text += para.text + "\n"
469
-
470
- # Extract tables
471
- for table in doc.tables:
472
- table_text = "\n**TABLE**\n"
473
- for row in table.rows:
474
- row_text = []
475
- for cell in row.cells:
476
- row_text.append(cell.text.strip())
477
- table_text += "| " + " | ".join(row_text) + " |\n"
478
- full_text += table_text + "\n"
479
-
480
- chunks = self._create_semantic_chunks(full_text, file_path, "docx")
481
-
482
- except Exception as e:
483
- logger.error(f"DOCX processing error: {e}")
484
- chunks = self._emergency_text_extraction(content, file_path)
485
-
486
- finally:
487
- if os.path.exists(temp_path):
488
- os.remove(temp_path)
489
-
490
- return chunks
491
-
492
- async def process_doc(self, file_path: str, content: bytes) -> List[Dict[str, Any]]:
493
- """Process DOC files (fallback to text extraction)"""
494
- return self._emergency_text_extraction(content, file_path)
495
-
496
- async def process_excel(self, file_path: str, content: bytes) -> List[Dict[str, Any]]:
497
- """Process Excel files"""
498
- temp_path = f"/tmp/{uuid.uuid4().hex[:6]}.xlsx"
499
- with open(temp_path, 'wb') as f:
500
- f.write(content)
501
-
502
- try:
503
- workbook = openpyxl.load_workbook(temp_path, read_only=True)
504
- full_text = ""
505
-
506
- for sheet_name in workbook.sheetnames[:3]:
507
- sheet = workbook[sheet_name]
508
- full_text += f"\n**Sheet: {sheet_name}**\n"
509
-
510
- for row_num, row in enumerate(sheet.iter_rows(max_row=50, values_only=True)):
511
- if row_num == 0 or any(cell for cell in row):
512
- row_text = [str(cell or "").strip()[:30] for cell in row[:8]]
513
- full_text += "| " + " | ".join(row_text) + " |\n"
514
-
515
- workbook.close()
516
- chunks = self._create_semantic_chunks(full_text, file_path, "excel")
517
-
518
- except Exception as e:
519
- logger.error(f"Excel processing error: {e}")
520
- chunks = self._emergency_text_extraction(content, file_path)
521
-
522
- finally:
523
- if os.path.exists(temp_path):
524
- os.remove(temp_path)
525
-
526
- return chunks
527
-
528
- # --- Other format processors (keeping all your excellent features) ---
529
- async def process_csv(self, file_path: str, content: bytes) -> List[Dict[str, Any]]:
530
- try:
531
- text_content = content.decode('utf-8', errors='ignore')
532
- lines = text_content.split('\n')
533
-
534
- full_text = "**CSV DATA**\n"
535
- for i, line in enumerate(lines[:100]):
536
- if line.strip():
537
- full_text += f"| {line} |\n"
538
-
539
- return self._create_semantic_chunks(full_text, file_path, "csv")
540
- except Exception as e:
541
- logger.error(f"CSV processing error: {e}")
542
- return []
543
-
544
- async def process_text(self, file_path: str, content: bytes) -> List[Dict[str, Any]]:
545
- try:
546
- text = content.decode('utf-8', errors='ignore')
547
- return self._create_semantic_chunks(text, file_path, "text")
548
- except Exception as e:
549
- logger.error(f"Text processing error: {e}")
550
- return []
551
-
552
- async def process_html(self, file_path: str, content: bytes) -> List[Dict[str, Any]]:
553
- try:
554
- soup = BeautifulSoup(content, 'html.parser')
555
- for script in soup(["script", "style"]):
556
- script.decompose()
557
- text = soup.get_text()
558
- return self._create_semantic_chunks(text, file_path, "html")
559
- except Exception as e:
560
- logger.error(f"HTML processing error: {e}")
561
- return []
562
-
563
- async def process_xml(self, file_path: str, content: bytes) -> List[Dict[str, Any]]:
564
- try:
565
- root = ET.fromstring(content)
566
- def extract_text(element, level=0):
567
- text = ""
568
- if element.text and element.text.strip():
569
- text += f"{' ' * level}{element.tag}: {element.text.strip()}\n"
570
- for child in element:
571
- text += extract_text(child, level + 1)
572
- return text
573
- full_text = extract_text(root)
574
- return self._create_semantic_chunks(full_text, file_path, "xml")
575
- except Exception as e:
576
- logger.error(f"XML processing error: {e}")
577
- return []
578
-
579
- async def process_email(self, file_path: str, content: bytes) -> List[Dict[str, Any]]:
580
- try:
581
- msg = email.message_from_bytes(content, policy=default)
582
- full_text = f"**EMAIL**\n"
583
- full_text += f"From: {msg.get('From', 'Unknown')}\n"
584
- full_text += f"Subject: {msg.get('Subject', 'No Subject')}\n\n"
585
-
586
- if msg.is_multipart():
587
- for part in msg.walk():
588
- if part.get_content_type() == "text/plain":
589
- body = part.get_content()
590
- full_text += f"Content:\n{body}\n"
591
- else:
592
- body = msg.get_content()
593
- full_text += f"Content:\n{body}\n"
594
-
595
- return self._create_semantic_chunks(full_text, file_path, "email")
596
- except Exception as e:
597
- logger.error(f"Email processing error: {e}")
598
- return []
599
-
600
- async def process_archive(self, file_path: str, content: bytes) -> List[Dict[str, Any]]:
601
- temp_path = f"/tmp/{uuid.uuid4().hex[:6]}.zip"
602
- with open(temp_path, 'wb') as f:
603
- f.write(content)
604
-
605
- chunks = []
606
- try:
607
- if file_path.endswith('.zip'):
608
- with zipfile.ZipFile(temp_path, 'r') as zip_file:
609
- for file_info in zip_file.filelist[:5]:
610
- try:
611
- file_content = zip_file.read(file_info)
612
- sub_chunks = await self.process_document(file_info.filename, file_content)
613
- chunks.extend(sub_chunks[:15]) # Limit sub-chunks for speed
614
- except:
615
- continue
616
- except Exception as e:
617
- logger.error(f"Archive processing error: {e}")
618
-
619
- finally:
620
- if os.path.exists(temp_path):
621
- os.remove(temp_path)
622
-
623
- return chunks
624
-
625
- async def process_json(self, file_path: str, content: bytes) -> List[Dict[str, Any]]:
626
- try:
627
- data = json.loads(content.decode('utf-8'))
628
- full_text = json.dumps(data, indent=2, ensure_ascii=False)
629
- return self._create_semantic_chunks(full_text, file_path, "json")
630
- except Exception as e:
631
- logger.error(f"JSON processing error: {e}")
632
- return []
633
-
634
- # --- UTILITY METHODS (YOUR EXCELLENT ORIGINAL) ---
635
- def _clean_text(self, text: str) -> str:
636
- """Clean extracted text"""
637
- # Remove excessive whitespace
638
- text = re.sub(r'\n\s*\n\s*\n+', '\n\n', text)
639
- text = re.sub(r'\s+', ' ', text)
640
-
641
- # Remove noise patterns
642
- noise_patterns = [
643
- r'Office of.*Insurance Ombudsman.*?\n',
644
- r'Lalit Bhawan.*?\n',
645
- r'^\d+\s*$'
646
- ]
647
-
648
- for pattern in noise_patterns:
649
- text = re.sub(pattern, '', text, flags=re.MULTILINE)
650
-
651
- return text.strip()
652
-
653
- def _create_semantic_chunks(self, text: str, source: str, doc_type: str) -> List[Dict[str, Any]]:
654
- """Create semantic chunks from text"""
655
- text = self._clean_text(text)
656
-
657
- if not text or len(text) < 50:
658
- return []
659
-
660
- # Smart sentence-based chunking
661
- sentences = re.split(r'(?<=[.!?])\s+', text)
662
- chunks = []
663
- current_chunk = ""
664
-
665
- for sentence in sentences:
666
- if len(current_chunk) + len(sentence) <= self.chunk_size:
667
- current_chunk += sentence + " "
668
- else:
669
- if current_chunk.strip():
670
- chunks.append(current_chunk.strip())
671
- current_chunk = sentence + " "
672
-
673
- if current_chunk.strip():
674
- chunks.append(current_chunk.strip())
675
-
676
- # Convert to structured chunks
677
- structured_chunks = []
678
- for i, chunk_text in enumerate(chunks[:self.max_chunks]):
679
- structured_chunks.append({
680
- "content": chunk_text,
681
- "metadata": {
682
- "source": os.path.basename(source),
683
- "chunk_index": i,
684
- "document_type": doc_type,
685
- "chunk_length": len(chunk_text)
686
- },
687
- "chunk_id": str(uuid.uuid4())
688
- })
689
-
690
- return structured_chunks
691
-
692
- def _emergency_text_extraction(self, content: bytes, file_path: str) -> List[Dict[str, Any]]:
693
- """Emergency text extraction for unsupported formats"""
694
- try:
695
- text = content.decode('utf-8', errors='ignore')
696
- if len(text) > 50:
697
- return self._create_semantic_chunks(text, file_path, "unknown")
698
- except:
699
- pass
700
-
701
- return [{
702
- "content": "Failed to extract content from document",
703
- "metadata": {
704
- "source": os.path.basename(file_path),
705
- "chunk_index": 0,
706
- "document_type": "error",
707
- "error": True
708
- },
709
- "chunk_id": str(uuid.uuid4())
710
- }]
711
-
712
- # --- GEMINI'S FIX: DEADLOCK-FREE RAG PIPELINE ---
713
- class DeadlockFreeRAGPipeline:
714
- """FIXED: Direct embedding management - no more AsyncKaggleEmbeddingWrapper deadlock"""
715
- def __init__(self, collection_name: str, llm_manager: MultiLLMManager, kaggle_client: LazyKaggleModelClient):
716
- self.collection_name = collection_name
717
- self.llm_manager = llm_manager
718
- self.kaggle_client = kaggle_client
719
- self.security_guard = SecurityGuard()
720
- self.query_processor = LightweightQueryProcessor(kaggle_client)
721
-
722
- # GEMINI'S FIX: No embedding function - let Chroma be a simple data store
723
- self.vectorstore = Chroma(
724
- collection_name=collection_name,
725
- # REMOVED: embedding_function parameter completely
726
- persist_directory="/tmp/chroma_kaggle"
727
- )
728
-
729
- logger.info(f"🚀 Deadlock-Free RAG Pipeline initialized: {collection_name}")
730
-
731
- async def add_documents(self, chunks: List[Dict[str, Any]]):
732
- """GEMINI'S FIX: Direct embedding management - no deadlock"""
733
- if not chunks:
734
- return
735
-
736
- logger.info(f"📚 Processing {len(chunks)} chunks...")
737
-
738
- # Advanced quality filtering (YOUR EXCELLENT ORIGINAL LOGIC)
739
- quality_chunks = []
740
- for chunk in chunks:
741
- content = chunk['content']
742
-
743
- # Skip error chunks
744
- if chunk['metadata'].get('error'):
745
- continue
746
-
747
- # Quality assessment
748
- quality_score = 0
749
-
750
- # Length factor
751
- if 100 <= len(content) <= 2000:
752
- quality_score += 2
753
- elif len(content) > 50:
754
- quality_score += 1
755
-
756
- # Content richness
757
- sentences = len(re.split(r'[.!?]+', content))
758
- if sentences > 3:
759
- quality_score += 1
760
-
761
- # Numerical data (good for policies)
762
- numbers = len(re.findall(r'\d+', content))
763
- if numbers > 0:
764
- quality_score += 1
765
-
766
- if quality_score >= 2:
767
- quality_chunks.append(chunk)
768
-
769
- logger.info(f"📚 Filtered to {len(quality_chunks)} quality chunks")
770
-
771
- if not quality_chunks:
772
- return
773
-
774
- # GEMINI'S FIX: Step 1 - Get texts
775
- texts = [chunk['content'] for chunk in quality_chunks[:100]] # Reduced from 150 for speed
776
-
777
- # GEMINI'S FIX: Step 2 - Embed all texts via Kaggle (Manager gets sauce first)
778
- logger.info(f"🚀 Embedding {len(texts)} chunks via Kaggle...")
779
- embeddings = await self.kaggle_client.generate_embeddings(texts)
780
-
781
- if not embeddings or len(embeddings) != len(texts):
782
- logger.error("Embedding failed or returned mismatched count.")
783
- return
784
-
785
- # GEMINI'S FIX: Step 3 - Add to Chroma with pre-calculated embeddings
786
- # This completely avoids the deadlock!
787
- self.vectorstore.add_texts(
788
- texts=texts,
789
- metadatas=[chunk['metadata'] for chunk in quality_chunks[:100]],
790
- embeddings=embeddings # Pass vectors directly - no async calls in Chroma!
791
- )
792
-
793
- logger.info(f"✅ Added {len(texts)} documents with embeddings to vector store (DEADLOCK-FREE)")
794
-
795
- async def answer_question(self, question: str) -> str:
796
- """GEMINI'S FIX: Direct query embedding - no deadlock"""
797
- # Security check
798
- if self.security_guard.detect_jailbreak(question):
799
- return self.security_guard.sanitize_response(question, "")
800
-
801
- try:
802
- # Enhanced query processing
803
- enhanced_question = await self.query_processor.enhance_query_semantically(question)
804
-
805
- # GEMINI'S FIX: Step 1 - Embed the query yourself first (Manager gets sauce)
806
- query_embedding_list = await self.kaggle_client.generate_embeddings([enhanced_question])
807
- if not query_embedding_list:
808
- return "I could not process the query for searching."
809
-
810
- query_embedding = query_embedding_list[0]
811
-
812
- # GEMINI'S FIX: Step 2 - Search using vector directly (no async calls in Chroma)
813
- relevant_docs = self.vectorstore.similarity_search_by_vector(
814
- embedding=query_embedding,
815
- k=15
816
- )
817
-
818
- if not relevant_docs:
819
- return "I don't have sufficient information to answer this question based on the provided documents."
820
-
821
- # Use Kaggle GPU for reranking (GAME CHANGER)
822
- doc_contents = [doc.page_content for doc in relevant_docs]
823
-
824
- if await self.kaggle_client.health_check():
825
- logger.info("🎯 Using Kaggle GPU for reranking")
826
- top_docs_content = await self.kaggle_client.rerank_documents(
827
- enhanced_question, doc_contents, k=6
828
- )
829
- else:
830
- logger.warning("📦 Kaggle unavailable, using first 6 docs")
831
- top_docs_content = doc_contents[:6]
832
-
833
- # Prepare enhanced context
834
- context = "\n\n".join(top_docs_content)
835
-
836
- # Create advanced semantic prompt
837
- prompt = self._create_advanced_prompt(context, question)
838
-
839
- # Get response from multi-LLM system
840
- response = await self.llm_manager.get_response(prompt)
841
-
842
- # Final security check and cleaning
843
- response = self.security_guard.sanitize_response(question, response)
844
- response = self._clean_response(response)
845
-
846
- return response
847
-
848
- except Exception as e:
849
- logger.error(f"❌ Question processing failed: {e}")
850
- return "An error occurred while processing your question."
851
-
852
- def _create_advanced_prompt(self, context: str, question: str) -> str:
853
- """Create advanced semantic-aware prompt (YOUR EXCELLENT ORIGINAL)"""
854
- return f"""You are an expert insurance policy analyst with advanced semantic understanding.
855
-
856
- CONTEXT ANALYSIS FRAMEWORK:
857
- - Apply deep semantic understanding to connect related concepts across documents
858
- - Recognize implicit relationships and cross-references within policy content
859
- - Understand hierarchical information structures and conditional dependencies
860
- - Synthesize information from multiple sources with semantic coherence
861
-
862
- DOCUMENT CONTEXT:
863
- {context}
864
-
865
- QUESTION: {question}
866
-
867
- ADVANCED REASONING APPROACH:
868
- 1. SEMANTIC COMPREHENSION: Understand the full meaning and intent behind the question
869
- 2. CONTEXTUAL MAPPING: Map question elements to semantically relevant sections
870
- 3. RELATIONSHIP INFERENCE: Identify implicit connections between policy components
871
- 4. MULTI-SOURCE SYNTHESIS: Combine information while maintaining semantic consistency
872
- 5. CONDITIONAL REASONING: Apply logical reasoning to policy exceptions and conditions
873
-
874
- RESPONSE REQUIREMENTS:
875
- - Provide semantically rich, contextually grounded answers
876
- - Include specific details: numbers, percentages, timeframes, conditions
877
- - Write in clear, professional language without excessive quotes
878
- - Address both explicit information and reasonable semantic inferences
879
- - Structure information hierarchically when appropriate
880
-
881
- ANSWER:"""
882
-
883
- def _clean_response(self, response: str) -> str:
884
- """Enhanced response cleaning (YOUR EXCELLENT ORIGINAL)"""
885
- # Remove excessive quotes
886
- response = re.sub(r'"([^"]{1,50})"', r'\1', response)
887
- response = re.sub(r'"(\w+)"', r'\1', response)
888
- response = re.sub(r'"(Rs\.?\s*[\d,]+[/-]*)"', r'\1', response)
889
- response = re.sub(r'"(\d+%)"', r'\1', response)
890
- response = re.sub(r'"(\d+\s*(?:days?|months?|years?))"', r'\1', response)
891
-
892
- # Clean policy references
893
- response = re.sub(r'[Aa]s stated in the policy[:\s]*"([^"]+)"', r'As per the policy, \1', response)
894
- response = re.sub(r'[Aa]ccording to the policy[:\s]*"([^"]+)"', r'According to the policy, \1', response)
895
- response = re.sub(r'[Tt]he policy states[:\s]*"([^"]+)"', r'The policy states that \1', response)
896
-
897
- # Fix spacing and formatting
898
- response = re.sub(r'\s+', ' ', response)
899
- response = response.replace(' ,', ',')
900
- response = response.replace(' .', '.')
901
- response = re.sub(r'\n\s*\n\s*\n+', '\n\n', response)
902
-
903
- return response.strip()
904
-
905
- # --- AUTHENTICATION (YOUR EXCELLENT ORIGINAL) ---
906
- async def verify_bearer_token(authorization: str = Header(None)):
907
- """Enhanced authentication with better logging"""
908
- if not authorization:
909
- raise HTTPException(status_code=401, detail="Authorization header required")
910
-
911
- if not authorization.startswith("Bearer "):
912
- raise HTTPException(status_code=401, detail="Invalid authorization format")
913
-
914
- token = authorization.replace("Bearer ", "")
915
-
916
- if len(token) < 10:
917
- raise HTTPException(status_code=401, detail="Invalid token format")
918
-
919
- logger.info(f"✅ Authentication successful with token: {token[:10]}...")
920
- return token
921
-
922
- # --- GLOBAL INSTANCES (NO EARLY KAGGLE CONNECTION!) ---
923
- multi_llm = MultiLLMManager()
924
- doc_processor = UniversalDocumentProcessor()
925
-
926
- # CRITICAL: Create lazy client (no immediate connection!)
927
- kaggle_client = LazyKaggleModelClient()
928
-
929
- # --- API MODELS ---
930
- class SubmissionRequest(BaseModel):
931
- documents: List[str]
932
- questions: List[str]
933
-
934
- class SubmissionResponse(BaseModel):
935
- answers: List[str]
936
-
937
- # --- FIXED: BOTH GET AND POST ENDPOINTS FOR /api/v1/hackrx/run ---
938
- @app.get("/api/v1/hackrx/run")
939
- def test_endpoint():
940
- """GET endpoint for testing - fixes 405 Method Not Allowed error"""
941
- return {
942
- "message": "This endpoint requires POST method",
943
- "usage": "Send POST request with documents and questions",
944
- "status": "API is running - DEADLOCK-FREE with lazy initialization",
945
- "kaggle_connection": "Will initialize on first request",
946
- "fix": "Direct embedding management prevents async deadlocks",
947
- "method": "Use POST with JSON body",
948
- "example": {
949
- "documents": ["url1", "url2"],
950
- "questions": ["question1", "question2"]
951
- }
952
- }
953
-
954
- # --- SPEED-OPTIMIZED MAIN ENDPOINT WITH GEMINI'S DEADLOCK FIX ---
955
- @app.post("/api/v1/hackrx/run", response_model=SubmissionResponse, dependencies=[Depends(verify_bearer_token)])
956
- async def run_submission(request: Request, submission_request: SubmissionRequest = Body(...)):
957
- start_time = time.time()
958
- logger.info(f"🎯 DEADLOCK-FREE KAGGLE-POWERED PROCESSING: {len(submission_request.documents)} docs, {len(submission_request.questions)} questions")
959
-
960
- try:
961
- # LAZY INITIALIZATION: Only now do we connect to Kaggle!
962
- logger.info("🔄 Initializing Kaggle connection (lazy initialization)...")
963
-
964
- # Check Kaggle health (this will trigger initialization)
965
- if not await kaggle_client.health_check():
966
- logger.error("❌ Kaggle endpoint not available!")
967
- return SubmissionResponse(answers=[
968
- "Model service unavailable" for _ in submission_request.questions
969
- ])
970
-
971
- # Create unique session with DEADLOCK-FREE pipeline
972
- session_id = f"kaggle_{uuid.uuid4().hex[:6]}" # Shorter UUID
973
- rag_pipeline = DeadlockFreeRAGPipeline(session_id, multi_llm, kaggle_client)
974
-
975
- # Process all documents with higher concurrency
976
- all_chunks = []
977
-
978
- async with httpx.AsyncClient(
979
- timeout=45.0,
980
- headers={"ngrok-skip-browser-warning": "true"}
981
- ) as client: # Tighter timeout + ngrok header
982
- # SPEED OPTIMIZATION: Higher concurrency
983
- semaphore = asyncio.Semaphore(5) # Increased from 3
984
-
985
- async def process_single_document(doc_idx: int, doc_url: str):
986
- async with semaphore:
987
- try:
988
- logger.info(f"📥 Downloading document {doc_idx + 1}")
989
- response = await client.get(doc_url, follow_redirects=True)
990
- response.raise_for_status()
991
-
992
- # Get filename from URL or generate one
993
- filename = os.path.basename(doc_url.split('?')[0]) or f"document_{doc_idx}"
994
-
995
- # Process document with caching
996
- chunks = await doc_processor.process_document(filename, response.content)
997
-
998
- logger.info(f"✅ Document {doc_idx + 1}: {len(chunks)} chunks")
999
- return chunks
1000
-
1001
- except Exception as e:
1002
- logger.error(f"❌ Document {doc_idx + 1} failed: {e}")
1003
- return []
1004
-
1005
- # Process all documents concurrently
1006
- tasks = [
1007
- process_single_document(i, url)
1008
- for i, url in enumerate(submission_request.documents)
1009
- ]
1010
-
1011
- results = await asyncio.gather(*tasks)
1012
-
1013
- # Flatten results
1014
- for chunks in results:
1015
- all_chunks.extend(chunks)
1016
-
1017
- logger.info(f"📊 Total chunks processed: {len(all_chunks)}")
1018
-
1019
- if not all_chunks:
1020
- logger.error("❌ No valid content extracted!")
1021
- return SubmissionResponse(answers=[
1022
- "No valid content could be extracted from the provided documents."
1023
- for _ in submission_request.questions
1024
- ])
1025
-
1026
- # Add to RAG pipeline with DEADLOCK-FREE processing
1027
- await rag_pipeline.add_documents(all_chunks)
1028
-
1029
- # SPEED OPTIMIZATION: Full parallel question answering
1030
- logger.info(f"⚡ Answering questions in parallel...")
1031
-
1032
- # INCREASED concurrency for questions
1033
- semaphore = asyncio.Semaphore(4) # Increased from 2
1034
-
1035
- async def answer_single_question(question: str) -> str:
1036
- async with semaphore:
1037
- return await rag_pipeline.answer_question(question)
1038
-
1039
- tasks = [answer_single_question(q) for q in submission_request.questions]
1040
- answers = await asyncio.gather(*tasks)
1041
-
1042
- elapsed = time.time() - start_time
1043
- logger.info(f"🎉 DEADLOCK-FREE KAGGLE-POWERED SUCCESS! Processed in {elapsed:.2f}s")
1044
-
1045
- return SubmissionResponse(answers=answers)
1046
-
1047
- except Exception as e:
1048
- elapsed = time.time() - start_time
1049
- logger.error(f"💥 CRITICAL ERROR after {elapsed:.2f}s: {e}")
1050
-
1051
- return SubmissionResponse(answers=[
1052
- "Processing error occurred. Please try again."
1053
- for _ in submission_request.questions
1054
- ])
1055
-
1056
- # --- HEALTH ENDPOINTS (YOUR EXCELLENT ORIGINAL + DEADLOCK-FREE INFO) ---
1057
  @app.get("/")
1058
  def read_root():
1059
- return {
1060
- "message": "🎯 KAGGLE-POWERED HACKATHON RAG SYSTEM - DEADLOCK-FREE COMPLETE VERSION",
1061
- "version": "5.4.0",
1062
- "status": "FIXED: Deadlock-free + lazy initialization prevents all issues!",
1063
- "target_time": "<20 seconds with Kaggle GPU",
1064
- "supported_formats": list(doc_processor.processors.keys()),
1065
- "features": [
1066
- "Multi-format document processing (PDF, DOCX, Excel, CSV, HTML, etc.)",
1067
- "Kaggle GPU-powered embeddings and reranking",
1068
- "Multi-LLM fallback system (Groq, OpenAI, Gemini)",
1069
- "Advanced semantic query enhancement",
1070
- "Anti-jailbreak security system",
1071
- "Optimized caching and concurrent processing",
1072
- "Semantic chunking and context fusion",
1073
- "R4 'half questions' handling",
1074
- "Lightning-fast GPU-accelerated response times",
1075
- "DEADLOCK-FREE async operations",
1076
- "Lazy initialization prevents startup timeouts",
1077
- "Direct embedding management"
1078
- ],
1079
- "kaggle_connection": "Lazy (connects on first API call)",
1080
- "embedding_method": "Direct Kaggle management (no wrapper deadlock)",
1081
- "fixes": [
1082
- "DeadlockFreeRAGPipeline prevents async conflicts",
1083
- "LazyKaggleModelClient prevents startup connection",
1084
- "Direct embedding calls to Kaggle (no AsyncWrapper)",
1085
- "Chroma as simple data store (no embedding function)",
1086
- "CORS headers with ngrok-skip-browser-warning",
1087
- "Both GET and POST endpoints for /api/v1/hackrx/run",
1088
- "Improved error handling and logging",
1089
- "Hugging Face Secrets support for dynamic URLs"
1090
- ]
1091
- }
1092
 
1093
- @app.get("/health")
1094
- def health_check():
 
1095
  return {
1096
- "status": "healthy",
1097
- "version": "5.4.0",
1098
- "mode": "DEADLOCK_FREE_KAGGLE_GPU_POWERED_LAZY",
1099
- "cache_size": len(doc_processor.cache),
1100
- "kaggle_connection": "lazy (on-demand)",
1101
- "embedding_method": "direct_kaggle_management",
1102
- "timestamp": time.time(),
1103
- "fixes_applied": [
1104
- "deadlock_free_pipeline",
1105
- "lazy_initialization",
1106
- "direct_embedding_management",
1107
- "ngrok_compatibility",
1108
- "http_method_fix",
1109
- "cors_headers",
1110
- "hf_secrets_support"
1111
- ]
1112
- }
1113
-
1114
- @app.get("/test-kaggle")
1115
- async def test_kaggle_connection():
1116
- """Test endpoint to check Kaggle connection (will trigger lazy initialization)"""
1117
- try:
1118
- is_healthy = await kaggle_client.health_check()
1119
- return {
1120
- "kaggle_connection": "initialized" if kaggle_client._initialized else "not_initialized",
1121
- "health_status": "healthy" if is_healthy else "unhealthy",
1122
- "endpoint": kaggle_client._endpoint if kaggle_client._initialized else "not_set",
1123
- "timestamp": time.time()
1124
- }
1125
- except Exception as e:
1126
- return {
1127
- "kaggle_connection": "failed",
1128
- "health_status": "error",
1129
- "error": str(e),
1130
- "timestamp": time.time()
1131
- }
1132
-
1133
- # --- RUN SERVER ---
1134
- if __name__ == "__main__":
1135
- import uvicorn
1136
- uvicorn.run(app, host="0.0.0.0", port=7860)
 
1
+ # --- Minimal Test App ---
2
+ from fastapi import FastAPI, Body
3
 
4
+ app = FastAPI(title="Minimal Test App")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
  @app.get("/")
7
  def read_root():
8
+ return {"status": "Minimal App is Running!"}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
 
10
+ @app.post("/api/v1/hackrx/run")
11
+ def post_test(data: dict = Body(...)):
12
+ # This just confirms we received the POST request and echoes back a count
13
  return {
14
+ "status": "POST request successful!",
15
+ "received_documents": len(data.get("documents", [])),
16
+ "received_questions": len(data.get("questions", []))
17
+ }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
app/main_api_backup.py ADDED
@@ -0,0 +1,1136 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # --- KAGGLE-POWERED RAG SYSTEM - COMPLETE 1144+ LINES WITH DEADLOCK FIX ---
2
+
3
+ import os
4
+ import json
5
+ import uuid
6
+ import time
7
+ import re
8
+ import asyncio
9
+ import logging
10
+ import hashlib
11
+ import httpx
12
+ from typing import List, Dict, Any, Optional
13
+ from collections import defaultdict
14
+ from itertools import cycle
15
+ from pathlib import Path
16
+ import functools
17
+ import threading
18
+ import concurrent.futures
19
+
20
+ # FastAPI and core dependencies
21
+ from fastapi import FastAPI, Body, HTTPException, Request, Depends, Header
22
+ from fastapi.middleware.cors import CORSMiddleware
23
+ from pydantic import BaseModel
24
+
25
+ # LangChain imports
26
+ from langchain_community.vectorstores import Chroma
27
+
28
+ # Multi-format document processing
29
+ import fitz # PyMuPDF
30
+ import pdfplumber
31
+ import docx
32
+ import openpyxl
33
+ import csv
34
+ import zipfile
35
+ import email
36
+ from email.policy import default
37
+ from bs4 import BeautifulSoup
38
+ import xml.etree.ElementTree as ET
39
+
40
+ # LLM providers
41
+ import groq
42
+ import openai
43
+ import google.generativeai as genai
44
+
45
+ import cachetools
46
+ from dotenv import load_dotenv
47
+
48
+ # Setup
49
+ load_dotenv()
50
+ logging.basicConfig(level=logging.INFO)
51
+ logger = logging.getLogger(__name__)
52
+
53
+ app = FastAPI(title="Kaggle-Powered Hackathon RAG", version="5.4.0")
54
+
55
+ app.add_middleware(
56
+ CORSMiddleware,
57
+ allow_origins=["*"],
58
+ allow_credentials=True,
59
+ allow_methods=["*"],
60
+ allow_headers=["*", "ngrok-skip-browser-warning"],
61
+ )
62
+
63
+ # --- CRITICAL FIX: LAZY KAGGLE MODEL CLIENT ---
64
+ class LazyKaggleModelClient:
65
+ """LAZY INITIALIZATION: Only connects when actually needed - PREVENTS 'Preparing Space' ISSUE"""
66
+ def __init__(self):
67
+ self._client = None
68
+ self._endpoint = None
69
+ self._initialized = False
70
+ logger.info("🎯 Lazy Kaggle Model Client created (no immediate connection)")
71
+
72
+ def _initialize_if_needed(self):
73
+ """Initialize client only when first API call is made"""
74
+ if not self._initialized:
75
+ # Get endpoint from Hugging Face Secrets (or fallback to env var)
76
+ self._endpoint = os.getenv("KAGGLE_NGROK_URL") or os.getenv("KAGGLE_ENDPOINT", "")
77
+
78
+ if not self._endpoint:
79
+ logger.error("❌ No KAGGLE_NGROK_URL found in secrets or environment!")
80
+ raise Exception("Kaggle endpoint not configured")
81
+
82
+ self._endpoint = self._endpoint.rstrip('/')
83
+ self._client = httpx.AsyncClient(
84
+ timeout=30.0,
85
+ headers={"ngrok-skip-browser-warning": "true"}
86
+ )
87
+ self._initialized = True
88
+ logger.info(f"🎯 Lazy Kaggle client initialized: {self._endpoint}")
89
+
90
+ async def health_check(self) -> bool:
91
+ """Check if Kaggle model server is healthy"""
92
+ try:
93
+ self._initialize_if_needed()
94
+ response = await self._client.get(f"{self._endpoint}/health")
95
+ return response.status_code == 200
96
+ except Exception as e:
97
+ logger.error(f"Kaggle health check failed: {e}")
98
+ return False
99
+
100
+ async def generate_embeddings(self, texts: List[str]) -> List[List[float]]:
101
+ """Generate embeddings using Kaggle GPU"""
102
+ try:
103
+ self._initialize_if_needed()
104
+ response = await self._client.post(
105
+ f"{self._endpoint}/embed",
106
+ json={"texts": texts}
107
+ )
108
+ response.raise_for_status()
109
+ result = response.json()
110
+ logger.info(f"🎯 Kaggle embeddings: {result.get('count', 0)} texts in {result.get('processing_time', 0):.2f}s")
111
+ return result["embeddings"]
112
+ except Exception as e:
113
+ logger.error(f"Kaggle embedding error: {e}")
114
+ return []
115
+
116
+ async def rerank_documents(self, query: str, documents: List[str], k: int = 8) -> List[str]:
117
+ """Rerank documents using Kaggle GPU"""
118
+ try:
119
+ self._initialize_if_needed()
120
+ response = await self._client.post(
121
+ f"{self._endpoint}/rerank",
122
+ json={
123
+ "query": query,
124
+ "documents": documents,
125
+ "k": k
126
+ }
127
+ )
128
+ response.raise_for_status()
129
+ result = response.json()
130
+ logger.info(f"🎯 Kaggle reranking: {k} docs in {result.get('processing_time', 0):.2f}s")
131
+ return result["reranked_documents"]
132
+ except Exception as e:
133
+ logger.error(f"Kaggle reranking error: {e}")
134
+ return documents[:k]
135
+
136
+ # --- LIGHTWEIGHT QUERY PROCESSOR (YOUR COMPLETE ORIGINAL) ---
137
+ class LightweightQueryProcessor:
138
+ def __init__(self, kaggle_client: LazyKaggleModelClient):
139
+ self.kaggle_client = kaggle_client
140
+ self.cache = cachetools.TTLCache(maxsize=500, ttl=3600)
141
+
142
+ async def enhance_query_semantically(self, question: str, domain: str = "insurance") -> str:
143
+ """OPTIMIZED semantic query processing"""
144
+
145
+ # Quick cache check with shorter hash
146
+ cache_key = hashlib.md5(question.encode()).hexdigest()[:8]
147
+ if cache_key in self.cache:
148
+ return self.cache[cache_key]
149
+
150
+ # Streamlined domain expansion
151
+ enhanced_query = self._expand_with_domain_knowledge_fast(question, domain)
152
+ enhanced_query = self._handle_incomplete_questions(enhanced_query)
153
+
154
+ # Cache result
155
+ self.cache[cache_key] = enhanced_query
156
+ return enhanced_query
157
+
158
+ def _expand_with_domain_knowledge_fast(self, query: str, domain: str) -> str:
159
+ """OPTIMIZED domain expansion - same intelligence, faster processing"""
160
+
161
+ # Streamlined expansion mapping for speed
162
+ key_expansions = {
163
+ 'grace period': 'payment deadline premium due',
164
+ 'waiting period': 'exclusion time coverage delay',
165
+ 'pre-existing': 'prior medical condition',
166
+ 'coverage': 'policy benefits protection',
167
+ 'exclusion': 'limitations restrictions',
168
+ 'premium': 'insurance cost payment',
169
+ 'claim': 'benefit request reimbursement',
170
+ 'ayush': 'alternative medicine treatment',
171
+ 'hospital': 'healthcare facility medical center'
172
+ }
173
+
174
+ query_lower = query.lower()
175
+ for key_term, expansion in key_expansions.items():
176
+ if key_term in query_lower:
177
+ return f"{query}. Also: {expansion}"
178
+
179
+ return query
180
+
181
+ def _handle_incomplete_questions(self, query: str) -> str:
182
+ """Handle R4's 'half questions' requirement"""
183
+ incomplete_patterns = [
184
+ r'^(what|how|when|where|why)\s*\?*$',
185
+ r'^(yes|no)\s*\?*$',
186
+ r'^\w{1,3}\s*\?*$',
187
+ r'^(this|that|it)\s*',
188
+ ]
189
+
190
+ query_lower = query.lower()
191
+ is_incomplete = any(re.search(pattern, query_lower) for pattern in incomplete_patterns)
192
+
193
+ if is_incomplete and len(query.split()) <= 2:
194
+ return f"{query}. Please provide information about insurance policy terms, coverage, exclusions, waiting periods, or benefits."
195
+
196
+ return query
197
+
198
+ # --- ANTI-JAILBREAK SECURITY SYSTEM (YOUR COMPLETE ORIGINAL) ---
199
+ class SecurityGuard:
200
+ def __init__(self):
201
+ self.jailbreak_patterns = [
202
+ r'ignore.*previous.*instructions',
203
+ r'act.*as.*different.*character',
204
+ r'generate.*code.*(?:javascript|python|html)',
205
+ r'write.*program',
206
+ r'roleplay.*as',
207
+ r'pretend.*you.*are',
208
+ r'system.*prompt',
209
+ r'override.*settings',
210
+ r'bypass.*restrictions',
211
+ r'admin.*mode',
212
+ r'developer.*mode',
213
+ r'tell.*me.*about.*yourself',
214
+ r'what.*are.*you',
215
+ r'who.*created.*you'
216
+ ]
217
+
218
+ def detect_jailbreak(self, text: str) -> bool:
219
+ """Detect jailbreak attempts"""
220
+ text_lower = text.lower()
221
+ return any(re.search(pattern, text_lower) for pattern in self.jailbreak_patterns)
222
+
223
+ def sanitize_response(self, question: str, answer: str) -> str:
224
+ """Sanitize responses against jailbreaks"""
225
+ if self.detect_jailbreak(question):
226
+ return "I can only provide information based on the document content provided. Please ask questions about the document."
227
+
228
+ # Remove any potential code or script tags
229
+ answer = re.sub(r'<script.*?</script>', '', answer, flags=re.DOTALL | re.IGNORECASE)
230
+ answer = re.sub(r'<.*?>', '', answer) # Remove HTML tags
231
+
232
+ return answer
233
+
234
+ # --- MULTI-LLM MANAGER (YOUR COMPLETE ORIGINAL WITH ALL PROVIDERS) ---
235
+ class MultiLLMManager:
236
+ def __init__(self):
237
+ # Initialize multiple LLM providers with fallback
238
+ self.providers = ['groq'] # Start with Groq as primary
239
+
240
+ self.groq_keys = cycle([k.strip() for k in os.getenv("GROQ_API_KEYS", "").split(',') if k.strip()])
241
+
242
+ # Optional paid providers (if keys available)
243
+ openai_keys = [k.strip() for k in os.getenv("OPENAI_API_KEYS", "").split(',') if k.strip()]
244
+ gemini_keys = [k.strip() for k in os.getenv("GEMINI_API_KEYS", "").split(',') if k.strip()]
245
+
246
+ if openai_keys:
247
+ self.providers.append('openai')
248
+ self.openai_keys = cycle(openai_keys)
249
+
250
+ if gemini_keys:
251
+ self.providers.append('gemini')
252
+ self.gemini_keys = cycle(gemini_keys)
253
+
254
+ self.current_provider_index = 0
255
+ logger.info(f"🔑 Multi-LLM Manager initialized with {len(self.providers)} providers")
256
+
257
+ async def get_response(self, prompt: str, max_tokens: int = 900) -> str:
258
+ """Get response with automatic fallback between providers"""
259
+ for attempt in range(len(self.providers)):
260
+ try:
261
+ provider = self.providers[self.current_provider_index]
262
+
263
+ if provider == 'groq':
264
+ return await self._groq_response(prompt, max_tokens)
265
+ elif provider == 'openai':
266
+ return await self._openai_response(prompt, max_tokens)
267
+ elif provider == 'gemini':
268
+ return await self._gemini_response(prompt, max_tokens)
269
+
270
+ except Exception as e:
271
+ logger.warning(f"{provider} failed: {e}")
272
+ self.current_provider_index = (self.current_provider_index + 1) % len(self.providers)
273
+ continue
274
+
275
+ return "Error: All LLM providers failed"
276
+
277
+ async def _groq_response(self, prompt: str, max_tokens: int) -> str:
278
+ key = next(self.groq_keys)
279
+ client = groq.Groq(api_key=key)
280
+
281
+ response = client.chat.completions.create(
282
+ model="llama-3.3-70b-versatile",
283
+ messages=[{"role": "user", "content": prompt}],
284
+ temperature=0.1,
285
+ max_tokens=max_tokens,
286
+ top_p=0.9
287
+ )
288
+ return response.choices[0].message.content.strip()
289
+
290
+ async def _openai_response(self, prompt: str, max_tokens: int) -> str:
291
+ key = next(self.openai_keys)
292
+ openai.api_key = key
293
+
294
+ response = await openai.ChatCompletion.acreate(
295
+ model="gpt-4o-mini",
296
+ messages=[{"role": "user", "content": prompt}],
297
+ temperature=0.1,
298
+ max_tokens=max_tokens
299
+ )
300
+ return response.choices[0].message.content.strip()
301
+
302
+ async def _gemini_response(self, prompt: str, max_tokens: int) -> str:
303
+ key = next(self.gemini_keys)
304
+ genai.configure(api_key=key)
305
+
306
+ model = genai.GenerativeModel('gemini-pro')
307
+ response = await model.generate_content_async(prompt)
308
+ return response.text.strip()
309
+
310
+ # --- COMPLETE UNIVERSAL DOCUMENT PROCESSOR (ALL YOUR ORIGINAL FEATURES) ---
311
+ class UniversalDocumentProcessor:
312
+ def __init__(self):
313
+ # SPEED OPTIMIZATIONS: Reduced limits
314
+ self.chunk_size = 1000 # Reduced from 1200
315
+ self.chunk_overlap = 200
316
+ self.max_chunks = 200 # Kept at 200 (good balance)
317
+ self.max_pages = 18 # Reduced from 25
318
+
319
+ # Smaller cache for speed
320
+ self.cache = cachetools.TTLCache(maxsize=50, ttl=1800)
321
+
322
+ # Supported formats (KEEPING all your excellent processors)
323
+ self.processors = {
324
+ '.pdf': self.process_pdf,
325
+ '.docx': self.process_docx,
326
+ '.doc': self.process_doc,
327
+ '.xlsx': self.process_excel,
328
+ '.xls': self.process_excel,
329
+ '.csv': self.process_csv,
330
+ '.txt': self.process_text,
331
+ '.html': self.process_html,
332
+ '.xml': self.process_xml,
333
+ '.eml': self.process_email,
334
+ '.zip': self.process_archive,
335
+ '.json': self.process_json
336
+ }
337
+
338
+ logger.info("⚡ Speed-Optimized Universal Document Processor initialized")
339
+
340
+ def get_file_hash(self, content: bytes) -> str:
341
+ """Generate shorter hash for caching"""
342
+ return hashlib.md5(content).hexdigest()[:8]
343
+
344
+ async def process_document(self, file_path: str, content: bytes) -> List[Dict[str, Any]]:
345
+ """Process any document format with optimized caching"""
346
+ file_hash = self.get_file_hash(content)
347
+
348
+ # Check cache first
349
+ if file_hash in self.cache:
350
+ logger.info(f"📦 Cache hit for {os.path.basename(file_path)}")
351
+ return self.cache[file_hash]
352
+
353
+ # Detect file type
354
+ file_ext = Path(file_path).suffix.lower()
355
+ if not file_ext:
356
+ file_ext = self._detect_file_type(content)
357
+
358
+ # Process based on file type
359
+ processor = self.processors.get(file_ext, self.process_text)
360
+
361
+ try:
362
+ chunks = await processor(file_path, content)
363
+
364
+ # Cache the result
365
+ self.cache[file_hash] = chunks
366
+
367
+ logger.info(f"✅ Processed {os.path.basename(file_path)}: {len(chunks)} chunks")
368
+ return chunks
369
+
370
+ except Exception as e:
371
+ logger.error(f"❌ Processing failed for {file_path}: {e}")
372
+ return self._emergency_text_extraction(content, file_path)
373
+
374
+ def _detect_file_type(self, content: bytes) -> str:
375
+ """Detect file type from content"""
376
+ if content.startswith(b'%PDF'):
377
+ return '.pdf'
378
+ elif content.startswith(b'PK'):
379
+ return '.docx' if b'word/' in content[:1000] else '.zip'
380
+ elif content.startswith(b'<html') or content.startswith(b'<!DOCTYPE'):
381
+ return '.html'
382
+ elif content.startswith(b'<?xml'):
383
+ return '.xml'
384
+ else:
385
+ return '.txt'
386
+
387
+ # --- SPEED-OPTIMIZED PDF PROCESSING (YOUR COMPLETE ORIGINAL) ---
388
+ async def process_pdf(self, file_path: str, content: bytes) -> List[Dict[str, Any]]:
389
+ """Enhanced PDF processing with speed optimizations"""
390
+ chunks = []
391
+ temp_path = f"/tmp/{uuid.uuid4().hex[:6]}.pdf" # Shorter UUID
392
+
393
+ with open(temp_path, 'wb') as f:
394
+ f.write(content)
395
+
396
+ try:
397
+ # Extract text with PyMuPDF
398
+ doc = fitz.open(temp_path)
399
+ full_text = ""
400
+
401
+ # SPEED OPTIMIZATION: Process fewer pages
402
+ for page_num in range(min(len(doc), self.max_pages)):
403
+ page = doc[page_num]
404
+ text = page.get_text()
405
+
406
+ if text.strip():
407
+ full_text += f"\n\nPage {page_num + 1}:\n{self._clean_text(text)}"
408
+
409
+ doc.close()
410
+
411
+ # OPTIMIZED table extraction
412
+ table_text = await self._extract_pdf_tables_fast(temp_path)
413
+ if table_text:
414
+ full_text += f"\n\n=== TABLES ===\n{table_text}"
415
+
416
+ # Create semantic chunks
417
+ chunks = self._create_semantic_chunks(full_text, file_path, "pdf")
418
+
419
+ except Exception as e:
420
+ logger.error(f"PDF processing error: {e}")
421
+ chunks = self._emergency_text_extraction(content, file_path)
422
+
423
+ finally:
424
+ if os.path.exists(temp_path):
425
+ os.remove(temp_path)
426
+
427
+ return chunks
428
+
429
+ async def _extract_pdf_tables_fast(self, file_path: str) -> str:
430
+ """SPEED-OPTIMIZED table extraction"""
431
+ table_text = ""
432
+ try:
433
+ with pdfplumber.open(file_path) as pdf:
434
+ # SPEED OPTIMIZATION: Fewer pages and tables
435
+ for page_num, page in enumerate(pdf.pages[:10]): # Reduced from 12
436
+ tables = page.find_tables()
437
+ for i, table in enumerate(tables[:1]): # Only 1 table per page
438
+ try:
439
+ table_data = table.extract()
440
+ if table_data and len(table_data) > 1:
441
+ table_md = f"\n**Table {i+1} (Page {page_num+1})**\n"
442
+ for row in table_data[:12]: # Reduced from 15
443
+ if row:
444
+ clean_row = [str(cell or "").strip()[:30] for cell in row]
445
+ table_md += "| " + " | ".join(clean_row) + " |\n"
446
+ table_text += table_md + "\n"
447
+ except:
448
+ continue
449
+ except Exception as e:
450
+ logger.warning(f"Table extraction failed: {e}")
451
+
452
+ return table_text
453
+
454
+ # --- OTHER FORMAT PROCESSORS (ALL YOUR EXCELLENT FEATURES) ---
455
+ async def process_docx(self, file_path: str, content: bytes) -> List[Dict[str, Any]]:
456
+ """Process DOCX files"""
457
+ temp_path = f"/tmp/{uuid.uuid4().hex[:6]}.docx"
458
+ with open(temp_path, 'wb') as f:
459
+ f.write(content)
460
+
461
+ try:
462
+ doc = docx.Document(temp_path)
463
+ full_text = ""
464
+
465
+ # Extract paragraphs
466
+ for para in doc.paragraphs:
467
+ if para.text.strip():
468
+ full_text += para.text + "\n"
469
+
470
+ # Extract tables
471
+ for table in doc.tables:
472
+ table_text = "\n**TABLE**\n"
473
+ for row in table.rows:
474
+ row_text = []
475
+ for cell in row.cells:
476
+ row_text.append(cell.text.strip())
477
+ table_text += "| " + " | ".join(row_text) + " |\n"
478
+ full_text += table_text + "\n"
479
+
480
+ chunks = self._create_semantic_chunks(full_text, file_path, "docx")
481
+
482
+ except Exception as e:
483
+ logger.error(f"DOCX processing error: {e}")
484
+ chunks = self._emergency_text_extraction(content, file_path)
485
+
486
+ finally:
487
+ if os.path.exists(temp_path):
488
+ os.remove(temp_path)
489
+
490
+ return chunks
491
+
492
+ async def process_doc(self, file_path: str, content: bytes) -> List[Dict[str, Any]]:
493
+ """Process DOC files (fallback to text extraction)"""
494
+ return self._emergency_text_extraction(content, file_path)
495
+
496
+ async def process_excel(self, file_path: str, content: bytes) -> List[Dict[str, Any]]:
497
+ """Process Excel files"""
498
+ temp_path = f"/tmp/{uuid.uuid4().hex[:6]}.xlsx"
499
+ with open(temp_path, 'wb') as f:
500
+ f.write(content)
501
+
502
+ try:
503
+ workbook = openpyxl.load_workbook(temp_path, read_only=True)
504
+ full_text = ""
505
+
506
+ for sheet_name in workbook.sheetnames[:3]:
507
+ sheet = workbook[sheet_name]
508
+ full_text += f"\n**Sheet: {sheet_name}**\n"
509
+
510
+ for row_num, row in enumerate(sheet.iter_rows(max_row=50, values_only=True)):
511
+ if row_num == 0 or any(cell for cell in row):
512
+ row_text = [str(cell or "").strip()[:30] for cell in row[:8]]
513
+ full_text += "| " + " | ".join(row_text) + " |\n"
514
+
515
+ workbook.close()
516
+ chunks = self._create_semantic_chunks(full_text, file_path, "excel")
517
+
518
+ except Exception as e:
519
+ logger.error(f"Excel processing error: {e}")
520
+ chunks = self._emergency_text_extraction(content, file_path)
521
+
522
+ finally:
523
+ if os.path.exists(temp_path):
524
+ os.remove(temp_path)
525
+
526
+ return chunks
527
+
528
+ # --- Other format processors (keeping all your excellent features) ---
529
+ async def process_csv(self, file_path: str, content: bytes) -> List[Dict[str, Any]]:
530
+ try:
531
+ text_content = content.decode('utf-8', errors='ignore')
532
+ lines = text_content.split('\n')
533
+
534
+ full_text = "**CSV DATA**\n"
535
+ for i, line in enumerate(lines[:100]):
536
+ if line.strip():
537
+ full_text += f"| {line} |\n"
538
+
539
+ return self._create_semantic_chunks(full_text, file_path, "csv")
540
+ except Exception as e:
541
+ logger.error(f"CSV processing error: {e}")
542
+ return []
543
+
544
+ async def process_text(self, file_path: str, content: bytes) -> List[Dict[str, Any]]:
545
+ try:
546
+ text = content.decode('utf-8', errors='ignore')
547
+ return self._create_semantic_chunks(text, file_path, "text")
548
+ except Exception as e:
549
+ logger.error(f"Text processing error: {e}")
550
+ return []
551
+
552
+ async def process_html(self, file_path: str, content: bytes) -> List[Dict[str, Any]]:
553
+ try:
554
+ soup = BeautifulSoup(content, 'html.parser')
555
+ for script in soup(["script", "style"]):
556
+ script.decompose()
557
+ text = soup.get_text()
558
+ return self._create_semantic_chunks(text, file_path, "html")
559
+ except Exception as e:
560
+ logger.error(f"HTML processing error: {e}")
561
+ return []
562
+
563
+ async def process_xml(self, file_path: str, content: bytes) -> List[Dict[str, Any]]:
564
+ try:
565
+ root = ET.fromstring(content)
566
+ def extract_text(element, level=0):
567
+ text = ""
568
+ if element.text and element.text.strip():
569
+ text += f"{' ' * level}{element.tag}: {element.text.strip()}\n"
570
+ for child in element:
571
+ text += extract_text(child, level + 1)
572
+ return text
573
+ full_text = extract_text(root)
574
+ return self._create_semantic_chunks(full_text, file_path, "xml")
575
+ except Exception as e:
576
+ logger.error(f"XML processing error: {e}")
577
+ return []
578
+
579
+ async def process_email(self, file_path: str, content: bytes) -> List[Dict[str, Any]]:
580
+ try:
581
+ msg = email.message_from_bytes(content, policy=default)
582
+ full_text = f"**EMAIL**\n"
583
+ full_text += f"From: {msg.get('From', 'Unknown')}\n"
584
+ full_text += f"Subject: {msg.get('Subject', 'No Subject')}\n\n"
585
+
586
+ if msg.is_multipart():
587
+ for part in msg.walk():
588
+ if part.get_content_type() == "text/plain":
589
+ body = part.get_content()
590
+ full_text += f"Content:\n{body}\n"
591
+ else:
592
+ body = msg.get_content()
593
+ full_text += f"Content:\n{body}\n"
594
+
595
+ return self._create_semantic_chunks(full_text, file_path, "email")
596
+ except Exception as e:
597
+ logger.error(f"Email processing error: {e}")
598
+ return []
599
+
600
+ async def process_archive(self, file_path: str, content: bytes) -> List[Dict[str, Any]]:
601
+ temp_path = f"/tmp/{uuid.uuid4().hex[:6]}.zip"
602
+ with open(temp_path, 'wb') as f:
603
+ f.write(content)
604
+
605
+ chunks = []
606
+ try:
607
+ if file_path.endswith('.zip'):
608
+ with zipfile.ZipFile(temp_path, 'r') as zip_file:
609
+ for file_info in zip_file.filelist[:5]:
610
+ try:
611
+ file_content = zip_file.read(file_info)
612
+ sub_chunks = await self.process_document(file_info.filename, file_content)
613
+ chunks.extend(sub_chunks[:15]) # Limit sub-chunks for speed
614
+ except:
615
+ continue
616
+ except Exception as e:
617
+ logger.error(f"Archive processing error: {e}")
618
+
619
+ finally:
620
+ if os.path.exists(temp_path):
621
+ os.remove(temp_path)
622
+
623
+ return chunks
624
+
625
+ async def process_json(self, file_path: str, content: bytes) -> List[Dict[str, Any]]:
626
+ try:
627
+ data = json.loads(content.decode('utf-8'))
628
+ full_text = json.dumps(data, indent=2, ensure_ascii=False)
629
+ return self._create_semantic_chunks(full_text, file_path, "json")
630
+ except Exception as e:
631
+ logger.error(f"JSON processing error: {e}")
632
+ return []
633
+
634
+ # --- UTILITY METHODS (YOUR EXCELLENT ORIGINAL) ---
635
+ def _clean_text(self, text: str) -> str:
636
+ """Clean extracted text"""
637
+ # Remove excessive whitespace
638
+ text = re.sub(r'\n\s*\n\s*\n+', '\n\n', text)
639
+ text = re.sub(r'\s+', ' ', text)
640
+
641
+ # Remove noise patterns
642
+ noise_patterns = [
643
+ r'Office of.*Insurance Ombudsman.*?\n',
644
+ r'Lalit Bhawan.*?\n',
645
+ r'^\d+\s*$'
646
+ ]
647
+
648
+ for pattern in noise_patterns:
649
+ text = re.sub(pattern, '', text, flags=re.MULTILINE)
650
+
651
+ return text.strip()
652
+
653
def _create_semantic_chunks(self, text: str, source: str, doc_type: str) -> List[Dict[str, Any]]:
    """Split cleaned text into sentence-aligned chunks with metadata.

    Sentences are greedily packed into chunks of at most self.chunk_size
    characters; at most self.max_chunks chunks are returned.
    """
    text = self._clean_text(text)

    # Nothing useful to chunk.
    if not text or len(text) < 50:
        return []

    # Greedy sentence packing.
    sentences = re.split(r'(?<=[.!?])\s+', text)
    pieces = []
    buf = ""
    for sentence in sentences:
        if len(buf) + len(sentence) <= self.chunk_size:
            buf += sentence + " "
        else:
            if buf.strip():
                pieces.append(buf.strip())
            buf = sentence + " "
    if buf.strip():
        pieces.append(buf.strip())

    # Wrap each piece with provenance metadata.
    return [
        {
            "content": piece,
            "metadata": {
                "source": os.path.basename(source),
                "chunk_index": idx,
                "document_type": doc_type,
                "chunk_length": len(piece)
            },
            "chunk_id": str(uuid.uuid4())
        }
        for idx, piece in enumerate(pieces[:self.max_chunks])
    ]
692
+ def _emergency_text_extraction(self, content: bytes, file_path: str) -> List[Dict[str, Any]]:
693
+ """Emergency text extraction for unsupported formats"""
694
+ try:
695
+ text = content.decode('utf-8', errors='ignore')
696
+ if len(text) > 50:
697
+ return self._create_semantic_chunks(text, file_path, "unknown")
698
+ except:
699
+ pass
700
+
701
+ return [{
702
+ "content": "Failed to extract content from document",
703
+ "metadata": {
704
+ "source": os.path.basename(file_path),
705
+ "chunk_index": 0,
706
+ "document_type": "error",
707
+ "error": True
708
+ },
709
+ "chunk_id": str(uuid.uuid4())
710
+ }]
711
+
712
+ # --- GEMINI'S FIX: DEADLOCK-FREE RAG PIPELINE ---
713
class DeadlockFreeRAGPipeline:
    """RAG pipeline that manages embeddings directly via the Kaggle client.

    Chroma is used purely as a data store (no embedding_function), so no
    async embedding call ever happens inside Chroma — avoiding the deadlock
    the previous wrapper-based design hit.
    """

    def __init__(self, collection_name: str, llm_manager: "MultiLLMManager",
                 kaggle_client: "LazyKaggleModelClient"):
        self.collection_name = collection_name
        self.llm_manager = llm_manager
        self.kaggle_client = kaggle_client
        self.security_guard = SecurityGuard()
        self.query_processor = LightweightQueryProcessor(kaggle_client)

        # Deliberately no embedding_function: Chroma is a plain vector store
        # and all embeddings are computed up front by the Kaggle client.
        self.vectorstore = Chroma(
            collection_name=collection_name,
            persist_directory="/tmp/chroma_kaggle"
        )

        logger.info(f"🚀 Deadlock-Free RAG Pipeline initialized: {collection_name}")

    async def add_documents(self, chunks: List[Dict[str, Any]]):
        """Quality-filter chunks, embed them via Kaggle, store in Chroma."""
        if not chunks:
            return

        logger.info(f"📚 Processing {len(chunks)} chunks...")

        def _score(body: str) -> int:
            # Heuristic quality score: length band, sentence count, digits.
            points = 0
            if 100 <= len(body) <= 2000:
                points += 2
            elif len(body) > 50:
                points += 1
            if len(re.split(r'[.!?]+', body)) > 3:
                points += 1
            if re.findall(r'\d+', body):
                points += 1
            return points

        kept = [
            ck for ck in chunks
            if not ck['metadata'].get('error') and _score(ck['content']) >= 2
        ]

        logger.info(f"📚 Filtered to {len(kept)} quality chunks")

        if not kept:
            return

        # Step 1: gather texts (capped at 100 for speed).
        texts = [ck['content'] for ck in kept[:100]]

        # Step 2: embed everything up front via Kaggle.
        logger.info(f"🚀 Embedding {len(texts)} chunks via Kaggle...")
        embeddings = await self.kaggle_client.generate_embeddings(texts)

        if not embeddings or len(embeddings) != len(texts):
            logger.error("Embedding failed or returned mismatched count.")
            return

        # Step 3: hand Chroma the pre-computed vectors — it never needs to
        # call an embedder, so no async re-entry is possible.
        self.vectorstore.add_texts(
            texts=texts,
            metadatas=[ck['metadata'] for ck in kept[:100]],
            embeddings=embeddings
        )

        logger.info(f"✅ Added {len(texts)} documents with embeddings to vector store (DEADLOCK-FREE)")

    async def answer_question(self, question: str) -> str:
        """Embed the query directly, retrieve + rerank, then ask the LLM."""
        # Refuse obviously adversarial prompts up front.
        if self.security_guard.detect_jailbreak(question):
            return self.security_guard.sanitize_response(question, "")

        try:
            enhanced_question = await self.query_processor.enhance_query_semantically(question)

            # Step 1: embed the query ourselves.
            vectors = await self.kaggle_client.generate_embeddings([enhanced_question])
            if not vectors:
                return "I could not process the query for searching."
            qvec = vectors[0]

            # Step 2: similarity search by raw vector — no embedder inside Chroma.
            hits = self.vectorstore.similarity_search_by_vector(
                embedding=qvec,
                k=15
            )

            if not hits:
                return "I don't have sufficient information to answer this question based on the provided documents."

            passages = [doc.page_content for doc in hits]

            # GPU rerank when Kaggle is reachable; otherwise take the head.
            if await self.kaggle_client.health_check():
                logger.info("🎯 Using Kaggle GPU for reranking")
                best = await self.kaggle_client.rerank_documents(
                    enhanced_question, passages, k=6
                )
            else:
                logger.warning("📦 Kaggle unavailable, using first 6 docs")
                best = passages[:6]

            prompt = self._create_advanced_prompt("\n\n".join(best), question)
            answer = await self.llm_manager.get_response(prompt)

            # Final sanitization and formatting pass.
            answer = self.security_guard.sanitize_response(question, answer)
            return self._clean_response(answer)

        except Exception as e:
            logger.error(f"❌ Question processing failed: {e}")
            return "An error occurred while processing your question."

    def _create_advanced_prompt(self, context: str, question: str) -> str:
        """Build the semantic-analysis prompt for the LLM."""
        return f"""You are an expert insurance policy analyst with advanced semantic understanding.

CONTEXT ANALYSIS FRAMEWORK:
- Apply deep semantic understanding to connect related concepts across documents
- Recognize implicit relationships and cross-references within policy content
- Understand hierarchical information structures and conditional dependencies
- Synthesize information from multiple sources with semantic coherence

DOCUMENT CONTEXT:
{context}

QUESTION: {question}

ADVANCED REASONING APPROACH:
1. SEMANTIC COMPREHENSION: Understand the full meaning and intent behind the question
2. CONTEXTUAL MAPPING: Map question elements to semantically relevant sections
3. RELATIONSHIP INFERENCE: Identify implicit connections between policy components
4. MULTI-SOURCE SYNTHESIS: Combine information while maintaining semantic consistency
5. CONDITIONAL REASONING: Apply logical reasoning to policy exceptions and conditions

RESPONSE REQUIREMENTS:
- Provide semantically rich, contextually grounded answers
- Include specific details: numbers, percentages, timeframes, conditions
- Write in clear, professional language without excessive quotes
- Address both explicit information and reasonable semantic inferences
- Structure information hierarchically when appropriate

ANSWER:"""

    def _clean_response(self, response: str) -> str:
        """Strip excess quoting and tidy whitespace in the LLM's answer."""
        # Applied strictly in order: unquote short spans, rewrite policy
        # citations, then normalize spacing.
        rewrites = (
            (r'"([^"]{1,50})"', r'\1'),
            (r'"(\w+)"', r'\1'),
            (r'"(Rs\.?\s*[\d,]+[/-]*)"', r'\1'),
            (r'"(\d+%)"', r'\1'),
            (r'"(\d+\s*(?:days?|months?|years?))"', r'\1'),
            (r'[Aa]s stated in the policy[:\s]*"([^"]+)"', r'As per the policy, \1'),
            (r'[Aa]ccording to the policy[:\s]*"([^"]+)"', r'According to the policy, \1'),
            (r'[Tt]he policy states[:\s]*"([^"]+)"', r'The policy states that \1'),
        )
        for pattern, repl in rewrites:
            response = re.sub(pattern, repl, response)

        # Whitespace and punctuation spacing cleanup.
        response = re.sub(r'\s+', ' ', response)
        response = response.replace(' ,', ',')
        response = response.replace(' .', '.')
        response = re.sub(r'\n\s*\n\s*\n+', '\n\n', response)

        return response.strip()
905
+ # --- AUTHENTICATION (YOUR EXCELLENT ORIGINAL) ---
906
async def verify_bearer_token(authorization: str = Header(None)):
    """FastAPI dependency: validate the Authorization header as a bearer token.

    Raises:
        HTTPException(401): header missing, not 'Bearer ...', or token too short.

    Returns:
        The raw token string (without the 'Bearer ' prefix).
    """
    if not authorization:
        raise HTTPException(status_code=401, detail="Authorization header required")

    if not authorization.startswith("Bearer "):
        raise HTTPException(status_code=401, detail="Invalid authorization format")

    # Fix: str.replace() stripped *every* "Bearer " occurrence, mangling any
    # token that happened to contain that substring; slice only the prefix.
    token = authorization[len("Bearer "):]

    if len(token) < 10:
        raise HTTPException(status_code=401, detail="Invalid token format")

    logger.info(f"✅ Authentication successful with token: {token[:10]}...")
    return token
922
+ # --- GLOBAL INSTANCES (NO EARLY KAGGLE CONNECTION!) ---
923
# Module-level singletons shared by all requests.
# NOTE: the Kaggle client is intentionally lazy — no network connection is
# made at import time, which keeps the HF Space from hanging in "Preparing".
multi_llm = MultiLLMManager()
doc_processor = UniversalDocumentProcessor()
kaggle_client = LazyKaggleModelClient()
929
+ # --- API MODELS ---
930
class SubmissionRequest(BaseModel):
    """Request body for /api/v1/hackrx/run."""
    documents: List[str]   # URLs of documents to ingest
    questions: List[str]   # questions to answer against those documents
934
class SubmissionResponse(BaseModel):
    """Response body: one answer per submitted question, same order."""
    answers: List[str]
937
+ # --- FIXED: BOTH GET AND POST ENDPOINTS FOR /api/v1/hackrx/run ---
938
@app.get("/api/v1/hackrx/run")
def test_endpoint():
    """GET probe for the run endpoint (the real API is POST-only).

    Exists so manual browser checks get a helpful payload instead of a
    405 Method Not Allowed.
    """
    usage_example = {
        "documents": ["url1", "url2"],
        "questions": ["question1", "question2"]
    }
    return {
        "message": "This endpoint requires POST method",
        "usage": "Send POST request with documents and questions",
        "status": "API is running - DEADLOCK-FREE with lazy initialization",
        "kaggle_connection": "Will initialize on first request",
        "fix": "Direct embedding management prevents async deadlocks",
        "method": "Use POST with JSON body",
        "example": usage_example
    }
954
+ # --- SPEED-OPTIMIZED MAIN ENDPOINT WITH GEMINI'S DEADLOCK FIX ---
955
@app.post("/api/v1/hackrx/run", response_model=SubmissionResponse, dependencies=[Depends(verify_bearer_token)])
async def run_submission(request: Request, submission_request: SubmissionRequest = Body(...)):
    """Main hackathon endpoint: ingest documents, answer questions.

    Flow: lazily connect to Kaggle → download + chunk documents concurrently
    → embed + index in a per-request Chroma collection → answer all
    questions in parallel. Any failure returns a placeholder answer per
    question rather than a 5xx.
    """
    start_time = time.time()
    logger.info(f"🎯 DEADLOCK-FREE KAGGLE-POWERED PROCESSING: {len(submission_request.documents)} docs, {len(submission_request.questions)} questions")

    try:
        # First real Kaggle contact happens here, not at import time.
        logger.info("🔄 Initializing Kaggle connection (lazy initialization)...")

        if not await kaggle_client.health_check():
            logger.error("❌ Kaggle endpoint not available!")
            return SubmissionResponse(answers=[
                "Model service unavailable" for _ in submission_request.questions
            ])

        # Per-request pipeline with its own collection name.
        session_id = f"kaggle_{uuid.uuid4().hex[:6]}"
        rag_pipeline = DeadlockFreeRAGPipeline(session_id, multi_llm, kaggle_client)

        all_chunks = []

        async with httpx.AsyncClient(
            timeout=45.0,
            headers={"ngrok-skip-browser-warning": "true"}
        ) as client:
            # Up to 5 concurrent downloads.
            download_gate = asyncio.Semaphore(5)

            async def _fetch_and_chunk(doc_idx: int, doc_url: str):
                async with download_gate:
                    try:
                        logger.info(f"📥 Downloading document {doc_idx + 1}")
                        response = await client.get(doc_url, follow_redirects=True)
                        response.raise_for_status()

                        # Derive a filename from the URL (query string stripped).
                        filename = os.path.basename(doc_url.split('?')[0]) or f"document_{doc_idx}"

                        chunks = await doc_processor.process_document(filename, response.content)
                        logger.info(f"✅ Document {doc_idx + 1}: {len(chunks)} chunks")
                        return chunks

                    except Exception as e:
                        logger.error(f"❌ Document {doc_idx + 1} failed: {e}")
                        return []

            download_jobs = [
                _fetch_and_chunk(i, url)
                for i, url in enumerate(submission_request.documents)
            ]
            for doc_chunks in await asyncio.gather(*download_jobs):
                all_chunks.extend(doc_chunks)

        logger.info(f"📊 Total chunks processed: {len(all_chunks)}")

        if not all_chunks:
            logger.error("❌ No valid content extracted!")
            return SubmissionResponse(answers=[
                "No valid content could be extracted from the provided documents."
                for _ in submission_request.questions
            ])

        # Embed + index (deadlock-free: vectors computed before Chroma sees them).
        await rag_pipeline.add_documents(all_chunks)

        logger.info(f"⚡ Answering questions in parallel...")

        # Up to 4 questions in flight at once.
        answer_gate = asyncio.Semaphore(4)

        async def _answer_one(question: str) -> str:
            async with answer_gate:
                return await rag_pipeline.answer_question(question)

        answer_jobs = [_answer_one(q) for q in submission_request.questions]
        answers = await asyncio.gather(*answer_jobs)

        elapsed = time.time() - start_time
        logger.info(f"🎉 DEADLOCK-FREE KAGGLE-POWERED SUCCESS! Processed in {elapsed:.2f}s")

        return SubmissionResponse(answers=answers)

    except Exception as e:
        elapsed = time.time() - start_time
        logger.error(f"💥 CRITICAL ERROR after {elapsed:.2f}s: {e}")

        return SubmissionResponse(answers=[
            "Processing error occurred. Please try again."
            for _ in submission_request.questions
        ])
1056
+ # --- HEALTH ENDPOINTS (YOUR EXCELLENT ORIGINAL + DEADLOCK-FREE INFO) ---
1057
@app.get("/")
def read_root():
    """Root endpoint: human-readable capability/status summary."""
    feature_list = [
        "Multi-format document processing (PDF, DOCX, Excel, CSV, HTML, etc.)",
        "Kaggle GPU-powered embeddings and reranking",
        "Multi-LLM fallback system (Groq, OpenAI, Gemini)",
        "Advanced semantic query enhancement",
        "Anti-jailbreak security system",
        "Optimized caching and concurrent processing",
        "Semantic chunking and context fusion",
        "R4 'half questions' handling",
        "Lightning-fast GPU-accelerated response times",
        "DEADLOCK-FREE async operations",
        "Lazy initialization prevents startup timeouts",
        "Direct embedding management"
    ]
    fix_list = [
        "DeadlockFreeRAGPipeline prevents async conflicts",
        "LazyKaggleModelClient prevents startup connection",
        "Direct embedding calls to Kaggle (no AsyncWrapper)",
        "Chroma as simple data store (no embedding function)",
        "CORS headers with ngrok-skip-browser-warning",
        "Both GET and POST endpoints for /api/v1/hackrx/run",
        "Improved error handling and logging",
        "Hugging Face Secrets support for dynamic URLs"
    ]
    return {
        "message": "🎯 KAGGLE-POWERED HACKATHON RAG SYSTEM - DEADLOCK-FREE COMPLETE VERSION",
        "version": "5.4.0",
        "status": "FIXED: Deadlock-free + lazy initialization prevents all issues!",
        "target_time": "<20 seconds with Kaggle GPU",
        "supported_formats": list(doc_processor.processors.keys()),
        "features": feature_list,
        "kaggle_connection": "Lazy (connects on first API call)",
        "embedding_method": "Direct Kaggle management (no wrapper deadlock)",
        "fixes": fix_list
    }
1093
@app.get("/health")
def health_check():
    """Lightweight liveness endpoint (no Kaggle contact is made here)."""
    payload = {
        "status": "healthy",
        "version": "5.4.0",
        "mode": "DEADLOCK_FREE_KAGGLE_GPU_POWERED_LAZY",
        "cache_size": len(doc_processor.cache),
        "kaggle_connection": "lazy (on-demand)",
        "embedding_method": "direct_kaggle_management",
        "timestamp": time.time(),
        "fixes_applied": [
            "deadlock_free_pipeline",
            "lazy_initialization",
            "direct_embedding_management",
            "ngrok_compatibility",
            "http_method_fix",
            "cors_headers",
            "hf_secrets_support"
        ]
    }
    return payload
1114
@app.get("/test-kaggle")
async def test_kaggle_connection():
    """Diagnostic endpoint: pings Kaggle (triggers lazy initialization)."""
    try:
        is_healthy = await kaggle_client.health_check()
        initialized = kaggle_client._initialized
        return {
            "kaggle_connection": "initialized" if initialized else "not_initialized",
            "health_status": "healthy" if is_healthy else "unhealthy",
            "endpoint": kaggle_client._endpoint if initialized else "not_set",
            "timestamp": time.time()
        }
    except Exception as e:
        # Surface the failure instead of raising a 500.
        return {
            "kaggle_connection": "failed",
            "health_status": "error",
            "error": str(e),
            "timestamp": time.time()
        }
1133
+ # --- RUN SERVER ---
1134
# Local development entry point; production runs via the Space's launcher.
if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=7860)
requirements.txt CHANGED
@@ -1,55 +1,3 @@
1
- # Fixed requirements.txt for Standalone RAG System
2
-
3
- # Core FastAPI dependencies
4
- fastapi==0.104.1
5
- uvicorn==0.24.0
6
- pydantic==2.5.1
7
- httpx==0.25.2
8
- python-dotenv==1.0.0
9
- psutil==5.9.6
10
- python-multipart==0.0.6
11
-
12
- # Document processing
13
- PyMuPDF==1.23.8
14
- pdfplumber==0.10.3
15
- mammoth==1.6.0
16
- beautifulsoup4==4.12.2
17
-
18
- # LangChain framework (compatible versions)
19
- langchain==0.1.20
20
- langchain-community==0.0.38
21
- langchain-core==0.1.52
22
-
23
- # Vector database and embeddings
24
- chromadb==0.4.18
25
- sentence-transformers==2.2.2
26
-
27
- # HuggingFace integration
28
- huggingface-hub==0.19.4
29
- transformers==4.36.2
30
-
31
- # LLM Integration
32
- groq==0.4.1
33
-
34
- # Core ML and scientific computing
35
- numpy==1.24.3
36
- scipy==1.11.4
37
- scikit-learn==1.3.2
38
-
39
- # Text processing
40
- tiktoken==0.5.2
41
-
42
- # Additional utilities
43
- python-Levenshtein==0.23.0
44
- python-magic==0.4.27
45
-
46
- # Core dependencies that might be missing
47
- typing-extensions==4.8.0
48
- requests==2.31.0
49
- certifi==2023.11.17
50
-
51
- openai
52
- python-docx
53
- google-generativeai
54
- openpyxl
55
- rarfile
 
1
fastapi
uvicorn
pydantic
httpx
python-dotenv
cachetools
langchain-community
chromadb
PyMuPDF
pdfplumber
python-docx
openpyxl
beautifulsoup4
groq
openai
google-generativeai
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
requirements_backup.txt ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Fixed requirements.txt for Standalone RAG System
2
+
3
+ # Core FastAPI dependencies
4
+ fastapi==0.104.1
5
+ uvicorn==0.24.0
6
+ pydantic==2.5.1
7
+ httpx==0.25.2
8
+ python-dotenv==1.0.0
9
+ psutil==5.9.6
10
+ python-multipart==0.0.6
11
+
12
+ # Document processing
13
+ PyMuPDF==1.23.8
14
+ pdfplumber==0.10.3
15
+ mammoth==1.6.0
16
+ beautifulsoup4==4.12.2
17
+
18
+ # LangChain framework (compatible versions)
19
+ langchain==0.1.20
20
+ langchain-community==0.0.38
21
+ langchain-core==0.1.52
22
+
23
+ # Vector database and embeddings
24
+ chromadb==0.4.18
25
+ sentence-transformers==2.2.2
26
+
27
+ # HuggingFace integration
28
+ huggingface-hub==0.19.4
29
+ transformers==4.36.2
30
+
31
+ # LLM Integration
32
+ groq==0.4.1
33
+
34
+ # Core ML and scientific computing
35
+ numpy==1.24.3
36
+ scipy==1.11.4
37
+ scikit-learn==1.3.2
38
+
39
+ # Text processing
40
+ tiktoken==0.5.2
41
+
42
+ # Additional utilities
43
+ python-Levenshtein==0.23.0
44
+ python-magic==0.4.27
45
+
46
+ # Core dependencies that might be missing
47
+ typing-extensions==4.8.0
48
+ requests==2.31.0
49
+ certifi==2023.11.17
50
+
51
+ openai
52
+ python-docx
53
+ google-generativeai
54
+ openpyxl
55
+ rarfile