abhi2400 committed
Commit 20ad7d8 · verified · 1 Parent(s): f600a89

Update main.py

Files changed (1):
main.py +805 -805
main.py CHANGED
@@ -1,806 +1,806 @@
import os
import logging
import requests
import fitz  # PyMuPDF
import google.generativeai as genai
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel, validator
from typing import List, Dict
import re
import asyncio
from concurrent.futures import ThreadPoolExecutor
import time
import numpy as np
from sentence_transformers import SentenceTransformer
import hashlib
from pinecone import Pinecone, ServerlessSpec

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
-logger = logging.getLogger(_name_)
+logger = logging.getLogger(__name__)

# Initialize FastAPI
app = FastAPI(title="Debug Document QA API", version="5.1.0")

# Configuration
GEMINI_API_KEY = "AIzaSyBPa-4UMLTi81OgKUhTBuqczGzaKec4zP4"
PINECONE_API_KEY = "pcsk_7M5Zsf_84MeAJ4hBxCMN5z4AT3gkNNnTqqicAzA5A6o5m9XViUkCFRTjsk46FVc6mKiynD"
INDEX_NAME = "qa-fast-v2"

# Initialize services
genai.configure(api_key=GEMINI_API_KEY)
model = genai.GenerativeModel('gemini-2.0-flash')

# Lightweight embedding model
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
embedding_model.max_seq_length = 256

# Initialize Pinecone
try:
    pc = Pinecone(api_key=PINECONE_API_KEY)

    # Create index if it doesn't exist
    if INDEX_NAME not in pc.list_indexes().names():
        pc.create_index(
            name=INDEX_NAME,
            dimension=384,
            metric='cosine',
            spec=ServerlessSpec(cloud='aws', region='us-east-1')
        )
        time.sleep(5)

    index = pc.Index(INDEX_NAME)
    logger.info("✅ Pinecone connected successfully")
except Exception as e:
    logger.error(f"❌ Pinecone failed: {e}")
    index = None

executor = ThreadPoolExecutor(max_workers=4)
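
# Note: dimension=384 above must match the embedding model's output size;
# all-MiniLM-L6-v2 produces 384-dimensional vectors, which can be confirmed
# with embedding_model.get_sentence_embedding_dimension().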

# Models
class QARequest(BaseModel):
    documents: str
    questions: List[str]

    @validator('documents')
    def validate_url(cls, v):
        # Remove query parameters for the extension check
        base_url = v.split('?')[0]
        if not base_url.lower().endswith('.pdf'):
            raise ValueError('Must be PDF URL')
        return v

class QAResponse(BaseModel):
    answers: List[str]
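
# Example request/response shapes for the models above (illustrative values):
#   request:  {"documents": "https://example.com/policy.pdf",
#              "questions": ["What is the grace period for premium payment?"]}
#   response: {"answers": ["A grace period of 30 days is allowed ..."]}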

# Document processor
class DocumentProcessor:
-    def _init_(self):
+    def __init__(self):
        self.cache = {}

    def download_pdf(self, url: str) -> bytes:
        """Internal PDF download method with better error handling"""
        try:
            logger.info(f"📥 Downloading PDF from: {url}")

            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
                'Accept': 'application/pdf,application/octet-stream,*/*'
            }

            response = requests.get(url, headers=headers, timeout=30, stream=True)
            response.raise_for_status()

            content = response.content
            logger.info(f"📄 Downloaded {len(content)} bytes")
            return content

        except Exception as e:
            logger.error(f"❌ Failed to download PDF: {str(e)}")
            raise HTTPException(status_code=400, detail=f"Failed to download PDF: {str(e)}")

    def extract_text(self, pdf_bytes: bytes) -> str:
        """Extract text with better debugging"""
        try:
            logger.info(f"📖 Extracting text from {len(pdf_bytes)} bytes PDF")
            doc = fitz.open(stream=pdf_bytes, filetype="pdf")

            text_parts = []
            max_pages = min(doc.page_count, 50)  # Process more pages

            for page_num in range(max_pages):
                page = doc.load_page(page_num)
                text = page.get_text()
                if text.strip():
                    text_parts.append(text)
                    logger.info(f"Page {page_num + 1}: {len(text)} characters")

            doc.close()

            # Combine and clean
            full_text = ' '.join(text_parts)
            full_text = re.sub(r'\s+', ' ', full_text.strip())
            full_text = re.sub(r'Page \d+', '', full_text, flags=re.IGNORECASE)

            logger.info(f"📄 Total extracted text: {len(full_text)} characters from {max_pages} pages")

            # Log sample text for debugging
            sample_text = full_text[:500] if len(full_text) > 500 else full_text
            logger.info(f"📝 Sample text: {sample_text}...")

            return full_text

        except Exception as e:
            logger.error(f"❌ Text extraction failed: {e}")
            raise HTTPException(status_code=500, detail=f"Cannot extract text: {e}")

    def create_chunks(self, text: str) -> List[Dict]:
        """Create chunks with better debugging"""
        logger.info(f"🔪 Creating chunks from {len(text)} characters")

        chunk_size = 1000  # Smaller chunks for better matching
        overlap = 150

        # Split into sentences
        sentences = re.split(r'(?<=[.!?])\s+', text)
        logger.info(f"📋 Found {len(sentences)} sentences")

        chunks = []
        current_chunk = []
        current_length = 0

        for sentence in sentences:
            sentence = sentence.strip()
            if len(sentence) < 15:  # Skip very short sentences
                continue

            if current_length + len(sentence) > chunk_size and current_chunk:
                # Create chunk
                chunk_text = ' '.join(current_chunk)
                chunks.append({
                    'text': chunk_text,
                    'id': f"chunk_{len(chunks)}"
                })

                # Keep some overlap
                if len(current_chunk) > 1:
                    current_chunk = current_chunk[-1:] + [sentence]
                    current_length = sum(len(s) for s in current_chunk)
                else:
                    current_chunk = [sentence]
                    current_length = len(sentence)
            else:
                current_chunk.append(sentence)
                current_length += len(sentence)

        # Add final chunk
        if current_chunk:
            chunk_text = ' '.join(current_chunk)
            chunks.append({
                'text': chunk_text,
                'id': f"chunk_{len(chunks)}"
            })

        logger.info(f"📊 Created {len(chunks)} chunks")

        # Log sample chunks for debugging
        for i, chunk in enumerate(chunks[:3]):
            logger.info(f"Chunk {i}: {chunk['text'][:100]}...")

        return chunks
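
    # Worked example of the rule above: with chunk_size=1000, sentences are
    # accumulated until the next one would push a chunk past 1000 characters;
    # the chunk is then flushed and only the last sentence is carried over, so
    # adjacent chunks overlap by one sentence (the `overlap = 150` value is
    # not used in the loop).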

    async def store_in_pinecone(self, chunks: List[Dict], doc_id: str):
        """Store chunks in Pinecone with debugging"""
        if not index:
            logger.error("❌ Pinecone not available - storing chunks in memory fallback")
            # Store in memory as fallback
            self.cache[doc_id] = chunks
            return f"memory_{doc_id}"

        try:
            namespace = f"doc_{doc_id}"
            logger.info(f"💾 Storing {len(chunks)} chunks in Pinecone namespace: {namespace}")

            # Create embeddings in batch
            texts = [chunk['text'] for chunk in chunks]
            logger.info("🧠 Creating embeddings...")
            embeddings = embedding_model.encode(texts, batch_size=16, show_progress_bar=False)
            logger.info(f"✅ Created {len(embeddings)} embeddings, dimension: {len(embeddings[0])}")

            # Prepare vectors
            vectors = []
            for i, (chunk, embedding) in enumerate(zip(chunks, embeddings)):
                vectors.append({
                    'id': f"{doc_id}_{chunk['id']}",
                    'values': embedding.tolist(),
                    'metadata': {
                        'text': chunk['text'][:1000],  # Limit metadata size
                        'chunk_id': chunk['id']
                    }
                })

            # Upsert in batches
            batch_size = 50
            for i in range(0, len(vectors), batch_size):
                batch = vectors[i:i + batch_size]
                result = index.upsert(vectors=batch, namespace=namespace)
                logger.info(f"📤 Upserted batch {i//batch_size + 1}: {result}")

            # Verify storage
            time.sleep(2)  # Wait for indexing
            stats = index.describe_index_stats()
            logger.info(f"📊 Index stats: {stats}")

            return namespace

        except Exception as e:
            logger.error(f"❌ Pinecone storage failed: {e}")
            # Fallback to memory storage
            self.cache[doc_id] = chunks
            return f"memory_{doc_id}"

# QA Processor
class QAProcessor:
-    def _init_(self):
+    def __init__(self):
        self.answer_cache = {}

    async def search_context(self, question: str, namespace: str, doc_processor: DocumentProcessor) -> List[str]:
        """Enhanced context search with memory fallback"""
        logger.info(f"🔍 Searching for: '{question}' in namespace: {namespace}")

        # Check if using memory fallback
        if namespace.startswith("memory_"):
            doc_id = namespace.replace("memory_", "")
            if doc_id in doc_processor.cache:
                chunks = doc_processor.cache[doc_id]
                logger.info(f"🧠 Using memory fallback with {len(chunks)} chunks")

                # Simple keyword matching for memory fallback
                question_words = set(question.lower().split())
                scored_chunks = []

                for chunk in chunks:
                    chunk_words = set(chunk['text'].lower().split())
                    overlap = len(question_words.intersection(chunk_words))
                    if overlap > 0:
                        scored_chunks.append((chunk['text'], overlap))

                scored_chunks.sort(key=lambda x: x[1], reverse=True)
                contexts = [chunk for chunk, _ in scored_chunks[:8]]
                logger.info(f"📋 Found {len(contexts)} relevant chunks via memory search")
                return contexts

        if not index:
            logger.error("❌ Both Pinecone and memory fallback failed")
            return []

        try:
            # Create query embedding
            logger.info("🧠 Creating query embedding...")
            query_embedding = embedding_model.encode([question])[0]
            logger.info(f"✅ Query embedding created: dimension {len(query_embedding)}")

            # Search in Pinecone
            logger.info(f"🔍 Querying Pinecone in namespace: {namespace}")
            results = index.query(
                vector=query_embedding.tolist(),
                top_k=15,
                namespace=namespace,
                include_metadata=True
            )

            logger.info(f"📊 Pinecone returned {len(results.matches)} matches")

            # Log match scores for debugging
            for i, match in enumerate(results.matches[:5]):
                logger.info(f"Match {i}: score={match.score:.4f}, text={match.metadata['text'][:100]}...")

            # Collect contexts with lower threshold
            contexts = []
            for match in results.matches:
                if match.score > 0.1:  # Lower threshold
                    contexts.append(match.metadata['text'])

            logger.info(f"📋 Selected {len(contexts)} contexts above threshold")
            return contexts

        except Exception as e:
            logger.error(f"❌ Search failed: {e}")
            return []
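
    # Illustration of the 0.1 cutoff above: cosine scores of, say,
    # [0.62, 0.41, 0.08] would keep the first two matches and drop the third.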

    async def generate_answer(self, question: str, contexts: List[str]) -> str:
        """Generate concise, accurate answers"""
        logger.info(f"🤖 Generating answer for: '{question}' with {len(contexts)} contexts")

        if not contexts:
            logger.warning("⚠ No contexts found - trying direct text search")
            return "Answer not found in document."

        # Combine contexts intelligently
        combined_context = '\n'.join(contexts[:8])
        if len(combined_context) > 5000:
            combined_context = combined_context[:5000]

        logger.info(f"📝 Combined context length: {len(combined_context)}")

        # Enhanced prompt for better extraction
        prompt = f"""You are an expert at extracting specific information from insurance policy documents.

Your task: Find the exact answer to the question from the policy document below. Be concise and specific.

Insurance Policy Document:
{combined_context}

Question: {question}

Instructions:
- Give a direct, concise answer
- Include specific numbers, periods, percentages when mentioned
- If there are conditions, mention the key ones briefly
- Don't start with "Based on" or "According to"
- Keep answer under 100 words
- If no relevant information exists, say "Answer not found in document"

Answer:"""

        try:
            logger.info("🤖 Calling Gemini...")
            response = await asyncio.wait_for(
                asyncio.get_event_loop().run_in_executor(
                    executor,
                    lambda: model.generate_content(
                        prompt,
                        generation_config=genai.types.GenerationConfig(
                            temperature=0.05,  # Very low for consistency
                            max_output_tokens=150,  # Shorter answers
                            candidate_count=1,
                            top_p=0.9
                        )
                    )
                ),
                timeout=15.0
            )

            answer = response.text.strip()
            logger.info(f"✅ Generated answer: {answer[:100]}...")

            # Clean answer
            answer = self.clean_answer(answer)

            # Enhanced validation
            if self.is_valid_answer(answer, question):
                return answer

            # Try rule-based extraction for specific patterns
            logger.info("🛠 Trying enhanced rule-based extraction...")
            rule_answer = self.enhanced_rule_extraction(question, combined_context)
            if rule_answer != "Answer not found in document.":
                return rule_answer

            # Final fallback - return AI answer if it's not completely empty
            if answer and len(answer) > 10 and "not found" not in answer.lower():
                return answer

            return "Answer not found in document."

        except Exception as e:
            logger.error(f"❌ Generation failed: {e}")
            return self.enhanced_rule_extraction(question, combined_context)

    def clean_answer(self, answer: str) -> str:
        """Clean and format answer"""
        if not answer:
            return ""

        # Remove common prefixes
        prefixes = [
            "ANSWER:", "Based on the", "According to", "The context",
            "The document", "From the policy", "Answer:"
        ]

        for prefix in prefixes:
            if answer.startswith(prefix):
                answer = answer[len(prefix):].strip()
                if answer.startswith(':'):
                    answer = answer[1:].strip()

        # Clean formatting
        answer = re.sub(r'\s+', ' ', answer.strip())

        # Capitalize first letter
        if answer and answer[0].islower():
            answer = answer[0].upper() + answer[1:]

        return answer

    def is_valid_answer(self, answer: str, question: str) -> bool:
        """Check if answer is valid and relevant"""
        if not answer or len(answer) < 5:
            return False

        # Check for non-answers
        non_answers = [
            "answer not found", "not mentioned", "does not contain",
            "no information", "cannot be determined", "not specified"
        ]

        if any(phrase in answer.lower() for phrase in non_answers):
            return False

        # Check if answer contains relevant keywords from question
        question_words = set(question.lower().split())
        answer_words = set(answer.lower().split())

        # Should have some overlap
        overlap = len(question_words.intersection(answer_words))
        return overlap >= 1

    def enhanced_rule_extraction(self, question: str, context: str) -> str:
        """Enhanced rule-based extraction for insurance-specific queries"""
        logger.info(f"🛠 Enhanced rule extraction for: '{question}'")

        q_lower = question.lower()

        # Define comprehensive patterns for insurance terms
        insurance_patterns = {
            'grace period': {
                'patterns': [
                    r'grace period.*?(\d+)\s*(days?|months?)',
                    r'(\d+)\s*days?\s*grace\s*period',
                    r'premium.*?grace.*?(\d+)\s*days?',
                    r'grace.*?(\d+)\s*days?'
                ],
                'extract_sentence': True
            },
            'waiting period': {
                'patterns': [
                    r'waiting period.*?(\d+)\s*(days?|months?|years?)',
                    r'(\d+)\s*(days?|months?|years?).*?waiting\s*period',
                    r'pre.*?existing.*?(\d+)\s*(months?|years?)',
                    r'(\d+)\s*months?.*?continuous\s*coverage'
                ],
                'extract_sentence': True
            },
            'maternity': {
                'patterns': [
                    r'maternity.*?(\d+)\s*(months?|years?)',
                    r'(\d+)\s*months?.*?maternity',
                    r'pregnancy.*?(\d+)\s*months?',
                    r'childbirth.*?(\d+)\s*months?',
                    r'continuous.*?covered.*?(\d+)\s*months?'
                ],
                'extract_full': True
            },
            'cataract': {
                'patterns': [
                    r'cataract.*?(\d+)\s*(years?|months?)',
                    r'(\d+)\s*years?.*?cataract',
                    r'eye.*?surgery.*?(\d+)\s*years?',
                    r'cataract.*?waiting.*?(\d+)'
                ],
                'extract_sentence': True
            },
            'ncd|no claim discount': {
                'patterns': [
                    r'no claim discount.*?(\d+)%',
                    r'ncd.*?(\d+)%',
                    r'(\d+)%.*?no claim',
                    r'cumulative bonus.*?(\d+)%',
                    r'(\d+)%.*?claim.*?free'
                ],
                'extract_sentence': True
            },
            'room rent|icu': {
                'patterns': [
                    r'room rent.*?(\d+)%',
                    r'icu.*?(\d+)%',
                    r'(\d+)%.*?room rent',
                    r'(\d+)%.*?sum insured'
                ],
                'extract_sentence': True
            },
            'ayush': {
                'patterns': [
                    r'ayurveda.*?yoga.*?naturopathy',
                    r'ayush.*?hospital',
                    r'unani.*?siddha.*?homeopathy'
                ],
                'extract_full': True
            },
            'hospital': {
                'patterns': [
                    r'hospital.*?means.*?institution',
                    r'(\d+).*?inpatient beds',
                    r'qualified nursing staff'
                ],
                'extract_full': True
            }
        }

        # Find relevant pattern category
        for key, config in insurance_patterns.items():
            if any(word in q_lower for word in key.split('|')):
                logger.info(f"🔍 Checking patterns for: {key}")

                for pattern in config['patterns']:
                    matches = list(re.finditer(pattern, context, re.IGNORECASE))
                    if matches:
                        logger.info(f"✅ Pattern matched: {pattern}")

                        # Extract based on configuration
                        if config.get('extract_full'):
                            # Extract larger context around match
                            match = matches[0]
                            start = max(0, match.start() - 200)
                            end = min(len(context), match.end() + 200)
                            full_context = context[start:end]

                            # Find complete sentences
                            sentences = re.split(r'[.!?]+', full_context)
                            relevant_sentences = []

                            for sentence in sentences:
                                if (re.search(pattern, sentence, re.IGNORECASE) or
                                        any(word in sentence.lower() for word in key.split('|'))):
                                    relevant_sentences.append(sentence.strip())

                            if relevant_sentences:
                                result = '. '.join(relevant_sentences[:2])
                                return self.clean_extracted_answer(result)

                        else:  # extract_sentence
                            # Find the sentence containing the match
                            match = matches[0]
                            # Expand search area
                            start = max(0, match.start() - 150)
                            end = min(len(context), match.end() + 150)
                            sentence_area = context[start:end]

                            sentences = re.split(r'[.!?]+', sentence_area)
                            for sentence in sentences:
                                if re.search(pattern, sentence, re.IGNORECASE) and len(sentence.strip()) > 15:
                                    result = sentence.strip()
                                    return self.clean_extracted_answer(result)

        # Fallback: keyword-based extraction
        return self.keyword_based_extraction(question, context)

    def keyword_based_extraction(self, question: str, context: str) -> str:
        """Extract answer based on keyword matching"""
        question_keywords = [word.lower() for word in question.split() if len(word) > 3]

        if not question_keywords:
            return "Answer not found in document."

        sentences = re.split(r'[.!?]+', context)
        scored_sentences = []

        for sentence in sentences:
            sentence = sentence.strip()
            if len(sentence) < 20:
                continue

            sentence_lower = sentence.lower()
            score = 0

            # Count keyword matches
            for keyword in question_keywords:
                if keyword in sentence_lower:
                    score += 1

            # Bonus for numbers (common in insurance)
            if re.search(r'\d+', sentence):
                score += 0.5

            # Bonus for insurance terms
            insurance_terms = ['policy', 'coverage', 'benefit', 'premium', 'claim', 'period', 'limit']
            for term in insurance_terms:
                if term in sentence_lower:
                    score += 0.3

            if score >= 1.5:  # Threshold for relevance
                scored_sentences.append((sentence, score))

        if scored_sentences:
            # Sort by score and return best match
            scored_sentences.sort(key=lambda x: x[1], reverse=True)
            best_sentence = scored_sentences[0][0]
            return self.clean_extracted_answer(best_sentence)

        return "Answer not found in document."

    def clean_extracted_answer(self, answer: str) -> str:
        """Clean extracted answers"""
        if not answer:
            return ""

        # Remove common prefixes and suffixes
        prefixes_to_remove = [
            "however,", "therefore,", "moreover,", "furthermore,",
            "in addition,", "also,", "but,", "and,"
        ]

        answer = answer.strip()
        for prefix in prefixes_to_remove:
            if answer.lower().startswith(prefix):
                answer = answer[len(prefix):].strip()

        # Ensure proper capitalization
        if answer and answer[0].islower():
            answer = answer[0].upper() + answer[1:]

        # Add period if missing
        if answer and not answer.endswith(('.', '!', '?')):
            answer += '.'

        return answer

    async def process_all_questions(self, questions: List[str], namespace: str, doc_processor: DocumentProcessor) -> List[str]:
        """Process all questions with better error handling"""
        logger.info(f"🚀 Processing {len(questions)} questions")

        async def process_single(question: str) -> str:
            try:
                logger.info(f"❓ Processing: {question}")

                # Search and answer
                contexts = await self.search_context(question, namespace, doc_processor)
                answer = await self.generate_answer(question, contexts)

                logger.info(f"✅ Answer for '{question[:30]}...': {answer[:100]}...")
                return answer

            except Exception as e:
                logger.error(f"❌ Question processing failed: {e}")
                return "Answer not found in document."

        # Process questions sequentially for better debugging
        answers = []
        for question in questions:
            answer = await process_single(question)
            answers.append(answer)

        return answers

# Initialize processors
doc_processor = DocumentProcessor()
qa_processor = QAProcessor()

# API Routes
@app.get("/")
async def root():
    return {
        "message": "Debug Document QA API",
        "version": "5.1.0",
        "status": "ready",
        "pinecone": "connected" if index else "disconnected"
    }

@app.post("/hackrx/run", response_model=QAResponse)
async def process_qa(request: QARequest):
    """Debug QA endpoint with detailed logging"""

    start_time = time.time()
    logger.info(f"🚀 Starting QA processing for {len(request.questions)} questions")
    logger.info(f"📄 Document URL: {request.documents}")

    try:
        # Generate document ID
        doc_id = hashlib.md5(request.documents.encode()).hexdigest()[:12]
        namespace = f"doc_{doc_id}"
        logger.info(f"🆔 Document ID: {doc_id}, Namespace: {namespace}")

        # Check if document already processed
        doc_exists = False
        if index:
            try:
                # Test query to see if namespace exists
                test_result = index.query(
                    vector=[0.0] * 384,
                    top_k=1,
                    namespace=namespace,
                    include_metadata=False
                )
                doc_exists = len(test_result.matches) > 0
                logger.info(f"📋 Document exists in Pinecone: {doc_exists}")
            except Exception as e:
                logger.error(f"❌ Error checking document existence: {e}")
                doc_exists = False

        # Also check memory cache
        if not doc_exists and doc_id in doc_processor.cache:
            doc_exists = True
            namespace = f"memory_{doc_id}"
            logger.info("📋 Document exists in memory cache")

        # Process document if needed
        if not doc_exists:
            logger.info("📝 Processing new document...")

            # Download and extract
            pdf_bytes = doc_processor.download_pdf(request.documents)
            text = doc_processor.extract_text(pdf_bytes)

            if len(text) < 100:
                raise HTTPException(status_code=400, detail="No meaningful content found in PDF")

            # Create chunks and store
            chunks = doc_processor.create_chunks(text)
            namespace = await doc_processor.store_in_pinecone(chunks, doc_id)

            if not namespace:
                raise HTTPException(status_code=500, detail="Failed to process document")

            logger.info(f"✅ Document processed in {time.time() - start_time:.2f}s")
        else:
            logger.info("📋 Using cached document")

        # Process all questions
        answers = await qa_processor.process_all_questions(request.questions, namespace, doc_processor)

        total_time = time.time() - start_time
        logger.info(f"🎯 All processing completed in {total_time:.2f}s")
        logger.info(f"📊 Final answers: {[ans[:50] + '...' if len(ans) > 50 else ans for ans in answers]}")

        return QAResponse(answers=answers)

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"❌ Processing failed: {e}")
        raise HTTPException(status_code=500, detail=f"Processing error: {str(e)}")
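
# Example call against a local run of this app (URL and question are illustrative):
#   import requests
#   resp = requests.post(
#       "http://localhost:8000/hackrx/run",
#       json={"documents": "https://example.com/policy.pdf",
#             "questions": ["What is the grace period for premium payment?"]},
#   )
#   print(resp.json()["answers"])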

@app.get("/debug/stats")
async def debug_stats():
    """Debug endpoint to check system status"""
    stats = {
        "pinecone_connected": index is not None,
        "embedding_model": str(embedding_model),
        "cache_size": len(doc_processor.cache),
        "answer_cache_size": len(qa_processor.answer_cache)
    }

    if index:
        try:
            index_stats = index.describe_index_stats()
            stats["index_stats"] = index_stats
        except Exception as e:
            stats["index_error"] = str(e)

    return stats

@app.delete("/debug/clear")
async def clear_all_cache():
    """Clear all caches and namespaces"""
    doc_processor.cache.clear()
    qa_processor.answer_cache.clear()

    # Optionally clear Pinecone namespaces (be careful!)
    # if index:
    #     try:
    #         index.delete(delete_all=True)
    #     except Exception as e:
    #         pass

    return {"message": "All caches cleared"}

@app.get("/health")
async def health():
    return {
        "status": "healthy",
        "pinecone": "connected" if index else "disconnected",
        "gemini": "configured"
    }

-if _name_ == "_main_":
+if __name__ == "__main__":
    import uvicorn

    print("🚀 Starting DEBUG Document QA API...")
    print("🔍 Debug features enabled:")
    print("   - Detailed logging")
    print("   - Memory fallback for Pinecone")
    print("   - Enhanced rule-based matching")
    print("   - Debug endpoints (/debug/stats, /debug/clear)")
    print("   - Lower similarity thresholds")

    uvicorn.run(app, host="0.0.0.0", port=8000)
 