sohamchitimali commited on
Commit
9e93043
·
1 Parent(s): d6beeea

Deployment Fixes

Browse files
Files changed (2) hide show
  1. app.py +387 -585
  2. requirements.txt +11 -11
app.py CHANGED
@@ -38,7 +38,7 @@ async def hackrx_run(
38
  ):
39
  try:
40
  data = await request.json()
41
- documents = data.get("documents") # Single URL expected
42
  questions = data.get("questions")
43
 
44
  if not documents or not questions:
@@ -49,7 +49,7 @@ async def hackrx_run(
49
 
50
  # Handle single document URL
51
  if isinstance(documents, list):
52
- document_url = documents[0] # Take first document only
53
  else:
54
  document_url = documents
55
 
@@ -65,6 +65,7 @@ async def hackrx_run(
65
  return JSONResponse(content={"answers": answers}, status_code=200)
66
 
67
  except Exception as e:
 
68
  return JSONResponse(content={"error": str(e)}, status_code=500)
69
 
70
  @dataclass
@@ -106,7 +107,7 @@ class EnhancedDocumentProcessor:
106
  page_text = page.extract_text()
107
  if page_text:
108
  cleaned_text = self._clean_text_comprehensive(page_text)
109
- if len(cleaned_text.strip()) > 50:
110
  pages_content.append({
111
  'page_num': page_num + 1,
112
  'text': cleaned_text,
@@ -125,10 +126,12 @@ class EnhancedDocumentProcessor:
125
  'source_url': source_url
126
  }
127
 
 
128
  if len(self.cache) >= self.max_cache_size:
129
  self.cache.pop(next(iter(self.cache)))
130
  self.cache[cache_key] = result
131
 
 
132
  return result
133
 
134
  except Exception as e:
@@ -145,11 +148,11 @@ class EnhancedDocumentProcessor:
145
  for para in doc.paragraphs:
146
  if para.text.strip():
147
  cleaned_text = self._clean_text_comprehensive(para.text)
148
- if len(cleaned_text.strip()) > 20:
149
  paragraphs.append(cleaned_text)
150
  full_text += " " + cleaned_text
151
 
152
- return {
153
  'pages': [{'page_num': 1, 'text': full_text, 'word_count': len(full_text.split())}],
154
  'full_text': full_text.strip(),
155
  'total_pages': 1,
@@ -158,6 +161,9 @@ class EnhancedDocumentProcessor:
158
  'source_url': source_url
159
  }
160
 
 
 
 
161
  except Exception as e:
162
  logger.error(f"DOCX extraction error: {e}")
163
  return {'pages': [], 'full_text': '', 'total_pages': 0, 'total_words': 0, 'source_url': source_url}
@@ -167,37 +173,29 @@ class EnhancedDocumentProcessor:
167
  if not text:
168
  return ""
169
 
170
- # Basic cleaning
171
  text = re.sub(r'\s+', ' ', text.strip())
172
 
173
  # Fix spacing around punctuation
174
  text = re.sub(r'\s+([.,:;!?])', r'\1', text)
175
  text = re.sub(r'([.!?])\s*([A-Z])', r'\1 \2', text)
176
 
177
- # Fix spacing around numbers
178
- text = re.sub(r'(\d+)([A-Za-z])', r'\1 \2', text)
179
- text = re.sub(r'([A-Za-z])(\d+)', r'\1 \2', text)
180
-
181
- # Normalize common insurance terms
182
  text = re.sub(r'(\d+)\s*months?', r'\1 months', text, flags=re.IGNORECASE)
183
  text = re.sub(r'(\d+)\s*days?', r'\1 days', text, flags=re.IGNORECASE)
184
  text = re.sub(r'(\d+)\s*years?', r'\1 years', text, flags=re.IGNORECASE)
185
- text = re.sub(r'Rs\.?\s*(\d+)', r'Rs. \1', text, flags=re.IGNORECASE)
186
-
187
- # Remove page numbers and headers/footers
188
- text = re.sub(r'Page\s+\d+\s+of\s+\d+', '', text, flags=re.IGNORECASE)
189
- text = re.sub(r'^\d+\s*$', '', text, flags=re.MULTILINE)
190
- text = re.sub(r'^[-\s]*$', '', text, flags=re.MULTILINE)
191
 
192
- # Fix camelCase words
193
- text = re.sub(r'([a-z])([A-Z])', r'\1 \2', text)
 
 
194
 
195
  return text.strip()
196
 
197
  class EnhancedChunker:
198
  """Enhanced chunking with better context preservation"""
199
 
200
- def __init__(self, chunk_size: int = 400, overlap: int = 100, min_chunk_size: int = 120):
201
  self.chunk_size = chunk_size
202
  self.overlap = overlap
203
  self.min_chunk_size = min_chunk_size
@@ -212,79 +210,44 @@ class EnhancedChunker:
212
  if not full_text:
213
  return chunks
214
 
215
- # First, try to split by logical sections (headings, numbered items, etc.)
216
- sections = self._identify_sections(full_text)
217
-
218
- for section_text in sections:
219
- section_chunks = self._chunk_section(section_text, chunk_id)
220
- chunks.extend(section_chunks)
221
- chunk_id += len(section_chunks)
222
-
223
- # If no sections found, fall back to paragraph-based chunking
224
- if not chunks:
225
- chunks = self._chunk_by_paragraphs(full_text, chunk_id)
226
-
227
- logger.info(f"Created {len(chunks)} chunks from document")
228
- return chunks
229
-
230
- def _identify_sections(self, text: str) -> List[str]:
231
- """Identify logical sections in the text"""
232
- # Look for common insurance document patterns
233
- section_patterns = [
234
- r'\n\s*(?:SECTION|Section|ARTICLE|Article|CLAUSE|Clause)\s+[\dIVXLC]+[.\s]+[^\n]+',
235
- r'\n\s*\d+\.\s*[A-Z][^\n]+', # Numbered headings
236
- r'\n\s*[A-Z][A-Z\s]{10,}:', # All caps headings
237
- r'\n\s*(?:Benefits|Coverage|Exclusions|Conditions|Definitions)[^\n]*:',
238
- ]
239
-
240
- # Try to split by sections
241
- for pattern in section_patterns:
242
- matches = list(re.finditer(pattern, text, re.IGNORECASE))
243
- if len(matches) >= 2: # At least 2 sections
244
- sections = []
245
- for i, match in enumerate(matches):
246
- start = match.start()
247
- end = matches[i + 1].start() if i + 1 < len(matches) else len(text)
248
- section_text = text[start:end].strip()
249
- if len(section_text) > 100: # Meaningful section size
250
- sections.append(section_text)
251
-
252
- if sections:
253
- return sections
254
 
255
- return [] # No clear sections found
256
-
257
- def _chunk_section(self, section_text: str, start_chunk_id: int) -> List[DocumentChunk]:
258
- """Chunk a single section"""
259
- chunks = []
260
- chunk_id = start_chunk_id
261
 
262
- # Split section into sentences
263
- sentences = re.split(r'[.!?]+\s+', section_text)
264
- sentences = [s.strip() + '.' for s in sentences if s.strip()]
265
 
266
  current_chunk = ""
267
  current_words = 0
268
 
269
- for sentence in sentences:
270
  sentence_words = len(sentence.split())
271
 
 
272
  if current_words + sentence_words > self.chunk_size and current_chunk:
273
  if current_words >= self.min_chunk_size:
274
- chunk = self._create_chunk(current_chunk.strip(), chunk_id, 1, "Section")
275
  chunks.append(chunk)
276
  chunk_id += 1
277
 
278
  # Start new chunk with overlap
279
- if chunks:
280
- # Take last 2 sentences as overlap
281
- last_sentences = current_chunk.split('.')[-3:-1]
282
- overlap_text = '. '.join(s.strip() for s in last_sentences if s.strip()) + '. '
283
- current_chunk = overlap_text + sentence
284
- current_words = len(current_chunk.split())
285
- else:
286
- current_chunk = sentence
287
- current_words = sentence_words
 
 
 
 
 
 
 
288
  else:
289
  if current_chunk:
290
  current_chunk += " " + sentence
@@ -292,56 +255,18 @@ class EnhancedChunker:
292
  current_chunk = sentence
293
  current_words += sentence_words
294
 
295
- # Add final chunk
296
- if current_chunk.strip() and current_words >= self.min_chunk_size:
297
- chunk = self._create_chunk(current_chunk.strip(), chunk_id, 1, "Section")
298
- chunks.append(chunk)
299
-
300
- return chunks
301
-
302
- def _chunk_by_paragraphs(self, text: str, start_chunk_id: int) -> List[DocumentChunk]:
303
- """Fallback chunking by paragraphs"""
304
- chunks = []
305
- chunk_id = start_chunk_id
306
-
307
- paragraphs = re.split(r'\n\s*\n|\. {2,}', text)
308
- paragraphs = [p.strip() for p in paragraphs if len(p.strip()) > 30]
309
-
310
- current_chunk = ""
311
- current_words = 0
312
-
313
- for para in paragraphs:
314
- para_words = len(para.split())
315
-
316
- if current_words + para_words > self.chunk_size and current_chunk:
317
- if current_words >= self.min_chunk_size:
318
- chunk = self._create_chunk(current_chunk.strip(), chunk_id, 1, "Document")
319
- chunks.append(chunk)
320
- chunk_id += 1
321
-
322
- # Add overlap
323
- if chunks:
324
- sentences = re.split(r'[.!?]+\s+', current_chunk)
325
- overlap_sentences = sentences[-2:] if len(sentences) >= 2 else sentences
326
- overlap_text = '. '.join(overlap_sentences)
327
- current_chunk = overlap_text + " " + para
328
- current_words = len(current_chunk.split())
329
- else:
330
- current_chunk = para
331
- current_words = para_words
332
- else:
333
- current_chunk += " " + para if current_chunk else para
334
- current_words += para_words
335
-
336
  # Add final chunk
337
  if current_chunk.strip() and current_words >= self.min_chunk_size:
338
  chunk = self._create_chunk(current_chunk.strip(), chunk_id, 1, "Document")
339
  chunks.append(chunk)
340
 
341
- # Ensure we have at least one chunk
342
- if not chunks and text.strip():
343
- chunk = self._create_chunk(text.strip(), 0, 1, "Document")
 
 
344
  chunks.append(chunk)
 
345
 
346
  return chunks
347
 
@@ -363,44 +288,32 @@ class EnhancedChunker:
363
  score = 1.0
364
  text_lower = text.lower()
365
 
366
- # Generic insurance terms (not hardcoded to specific company)
367
- insurance_terms = [
368
- 'premium', 'deductible', 'coverage', 'claim', 'policy', 'waiting period',
369
- 'grace period', 'maternity', 'pre-existing', 'sum insured', 'benefit',
370
- 'exclusion', 'inclusion', 'hospital', 'treatment', 'medical', 'health',
371
- 'co-payment', 'copayment', 'cashless', 'reimbursement', 'network'
372
  ]
373
 
374
- # Financial/numerical terms
375
- financial_terms = [
376
- 'amount', 'cost', 'fee', 'charge', 'limit', 'maximum', 'minimum',
377
- 'percentage', 'rate', 'liability', 'compensation', 'rupees', 'rs'
378
  ]
379
 
380
- # Time-related terms
381
- time_terms = ['days', 'months', 'years', 'duration', 'period', 'term', 'validity']
382
-
383
- # Action/requirement terms
384
- action_terms = ['shall', 'will', 'must', 'required', 'mandatory', 'provided', 'covered']
385
-
386
  # Calculate scores
 
387
  insurance_count = sum(1 for term in insurance_terms if term in text_lower)
388
- financial_count = sum(1 for term in financial_terms if term in text_lower)
389
- time_count = sum(1 for term in time_terms if term in text_lower)
390
- action_count = sum(1 for term in action_terms if term in text_lower)
391
 
392
- score += insurance_count * 0.3
393
- score += financial_count * 0.2
394
- score += time_count * 0.2
395
- score += action_count * 0.15
396
 
397
  # Boost for numerical information
398
  if re.search(r'\d+\s*(days?|months?|years?)', text_lower):
399
  score += 0.4
400
- if re.search(r'rs\.?\s*\d+|\d+%', text_lower):
401
- score += 0.4
402
- if re.search(r'\d+\s*(lakh|crore)', text_lower):
403
- score += 0.3
404
 
405
  return min(score, 5.0)
406
 
@@ -414,12 +327,16 @@ class EnhancedQASystem:
414
  self.initialize_models()
415
 
416
  def initialize_models(self):
417
- """Initialize CPU-friendly model"""
418
- model_name = "Qwen/Qwen2.5-1.5B-Instruct"
419
- logger.info(f"Loading model: {model_name}")
420
  try:
 
421
  self.tokenizer = AutoTokenizer.from_pretrained(model_name)
422
 
 
 
 
 
423
  self.model = AutoModelForCausalLM.from_pretrained(
424
  model_name,
425
  torch_dtype=torch.float32,
@@ -427,72 +344,96 @@ class EnhancedQASystem:
427
  low_cpu_mem_usage=True
428
  )
429
 
430
- self.qa_pipeline = pipeline(
431
- "text-generation",
432
- model=self.model,
433
- tokenizer=self.tokenizer,
434
- device=-1,
435
- max_new_tokens=50,
436
- max_length=1200,
437
- return_full_text=False,
438
- do_sample=False,
439
- temperature=0.1,
440
- pad_token_id=self.tokenizer.eos_token_id,
441
- eos_token_id=self.tokenizer.eos_token_id,
442
- repetition_penalty=1.2
443
- )
444
-
445
  logger.info(f"Model loaded successfully: {model_name}")
446
 
447
  except Exception as e:
448
- logger.error(f"Failed to load model: {e}")
449
- raise RuntimeError(f"Model loading failed: {str(e)}")
 
 
 
450
 
451
  def generate_answer(self, question: str, context: str, top_chunks: List[DocumentChunk]) -> Dict[str, Any]:
452
  """Generate answer with comprehensive context analysis"""
453
  start_time = time.time()
454
  try:
455
- # First try pattern-based extraction
 
 
 
456
  direct_answer = self._extract_comprehensive_answer(question, context)
457
- if direct_answer:
 
458
  return {
459
  'answer': direct_answer,
460
  'confidence': 0.95,
461
- 'reasoning': "Direct extraction from document content",
462
  'processing_time': time.time() - start_time,
463
  'source_chunks': len(top_chunks)
464
  }
465
 
466
- # Enhanced prompt for better context understanding
467
- prompt = f"""You are an insurance document analyzer. Based on the given context, provide a precise, direct answer to the question. Focus on extracting exact information from the context.
468
-
469
- Context from insurance document:
470
- {context[:900]}
471
-
472
- Question: {question}
473
-
474
- Provide a clear, specific answer based only on the information in the context. If the information is not available, say so.
475
-
476
- Answer:"""
477
-
478
- result = self.qa_pipeline(
479
- prompt,
480
- max_new_tokens=40,
481
- do_sample=False,
482
- temperature=0.1
483
- )[0]['generated_text'].strip()
484
 
485
- if not result:
486
- result = "Information not available in the document."
487
- else:
488
- result = self._clean_and_validate_answer(result, context)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
489
 
490
- confidence = 0.8 if "not available" not in result.lower() else 0.3
 
 
 
 
 
 
 
 
 
491
 
492
  return {
493
- 'answer': result,
494
- 'confidence': confidence,
495
- 'reasoning': "Generated from document analysis",
496
  'processing_time': time.time() - start_time,
497
  'source_chunks': len(top_chunks)
498
  }
@@ -508,128 +449,119 @@ Answer:"""
508
  }
509
 
510
  def _extract_comprehensive_answer(self, question: str, context: str) -> Optional[str]:
511
- """Comprehensive pattern-based answer extraction"""
512
  question_lower = question.lower()
513
  context_lower = context.lower()
514
 
515
- # Grace period patterns
 
 
516
  if 'grace period' in question_lower:
517
  patterns = [
518
  r'grace period[^.]*?(\d+)\s*days?',
519
  r'(\d+)\s*days?[^.]*?grace period',
520
  r'premium.*?(\d+)\s*days?.*?grace',
521
- r'thirty\s*days?[^.]*?grace',
522
- r'grace[^.]*?thirty\s*days?',
523
- r'(\d+)\s*days?.*?grace.*?period'
 
 
524
  ]
525
 
526
- # Check for "thirty" spelled out
527
- if any(word in context_lower for word in ['thirty', '30']) and 'days' in context_lower and 'grace' in context_lower:
528
- return "The grace period is 30 days for premium payment."
 
529
 
530
  for pattern in patterns:
531
  match = re.search(pattern, context_lower)
532
- if match and match.groups():
533
- days = match.group(1)
534
- return f"The grace period is {days} days for premium payment."
 
 
535
 
536
- # Waiting period patterns
537
  if 'waiting period' in question_lower:
538
- # Pre-existing disease waiting period
539
- if any(term in question_lower for term in ['ped', 'pre-existing', 'disease']):
540
- patterns = [
541
- r'pre.?existing[^.]*?(\d+)\s*months?[^.]*?waiting',
542
- r'waiting[^.]*?(\d+)\s*months?[^.]*?pre.?existing',
543
- r'(\d+)\s*months?[^.]*?pre.?existing[^.]*?disease'
544
- ]
545
- for pattern in patterns:
546
- match = re.search(pattern, context_lower)
547
- if match:
548
- months = match.group(1)
549
- return f"Pre-existing diseases have a {months}-month waiting period."
550
-
551
- # General waiting period
552
  patterns = [
553
  r'waiting period[^.]*?(\d+)\s*(days?|months?)',
554
  r'(\d+)\s*(days?|months?)[^.]*?waiting period',
555
  r'wait.*?(\d+)\s*(days?|months?)',
556
- r'(\d+)\s*(months?|days?)[^.]*?wait'
 
557
  ]
 
558
  for pattern in patterns:
559
  match = re.search(pattern, context_lower)
560
- if match:
561
- number, unit = match.groups()
562
- return f"The waiting period is {number} {unit}."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
563
 
564
  # Maternity coverage
565
  if 'maternity' in question_lower:
566
- if any(num in context_lower for num in ['24', 'twenty-four', 'twenty four']):
567
- if 'months' in context_lower:
568
- return "Maternity coverage requires 24 months of continuous coverage."
569
- if re.search(r'maternity[^.]*?covered', context_lower):
570
- return "Yes, maternity is covered under the policy."
571
- if re.search(r'maternity[^.]*?(not covered|excluded)', context_lower):
572
- return "No, maternity is not covered under the policy."
573
-
574
- # Room rent limits
575
- if 'room rent' in question_lower or 'room charges' in question_lower:
576
- patterns = [
577
- r'room rent[^.]*?(\d+)%',
578
- r'(\d+)%[^.]*?room rent',
579
- r'room charges[^.]*?(\d+)%',
580
- r'accommodation[^.]*?(\d+)%',
581
- r'(\d+)%[^.]*?sum insured[^.]*?room'
582
- ]
583
- for pattern in patterns:
584
- match = re.search(pattern, context_lower)
585
- if match:
586
- percentage = match.group(1)
587
- return f"Room rent is limited to {percentage}% of sum insured."
588
 
589
- # Co-payment
590
- if 'co-payment' in question_lower or 'copayment' in question_lower:
591
- patterns = [
592
- r'co.?payment[^.]*?(\d+)%',
593
- r'(\d+)%[^.]*?co.?payment',
594
- r'patient[^.]*?bear[^.]*?(\d+)%',
595
- r'insured[^.]*?pay[^.]*?(\d+)%'
596
- ]
597
- for pattern in patterns:
598
- match = re.search(pattern, context_lower)
599
- if match:
600
- percentage = match.group(1)
601
- return f"Co-payment is {percentage}% of the claim amount."
602
 
603
- # Sum insured/Coverage amount
604
- if any(term in question_lower for term in ['sum insured', 'coverage amount', 'maximum coverage', 'policy amount']):
605
- patterns = [
606
- r'sum insured[^.]*?rs\.?\s*(\d+(?:,\d+)*(?:\s*lakh)?)',
607
- r'rs\.?\s*(\d+(?:,\d+)*(?:\s*lakh)?)[^.]*?sum insured',
608
- r'coverage[^.]*?rs\.?\s*(\d+(?:,\d+)*(?:\s*lakh)?)',
609
- r'maximum.*?benefit.*?rs\.?\s*(\d+(?:,\d+)*(?:\s*lakh)?)',
610
- r'policy.*?amount.*?rs\.?\s*(\d+(?:,\d+)*(?:\s*lakh)?)'
611
- ]
612
- for pattern in patterns:
613
- match = re.search(pattern, context_lower)
614
- if match:
615
- amount = match.group(1)
616
- return f"The sum insured/coverage amount is Rs. {amount}."
617
 
618
- # Age limits
619
- if 'age' in question_lower and any(term in question_lower for term in ['limit', 'maximum', 'minimum', 'entry']):
620
- patterns = [
621
- r'age[^.]*?(\d+)\s*years?[^.]*?(maximum|minimum|limit)',
622
- r'(maximum|minimum)[^.]*?age[^.]*?(\d+)\s*years?',
623
- r'entry[^.]*?age[^.]*?(\d+)\s*years?'
624
- ]
625
- for pattern in patterns:
626
- match = re.search(pattern, context_lower)
627
- if match:
628
- groups = match.groups()
629
- if len(groups) >= 2:
630
- age = groups[0] if groups[0].isdigit() else groups[1]
631
- limit_type = groups[1] if groups[0].isdigit() else groups[0]
632
- return f"The {limit_type} age limit is {age} years."
 
 
 
 
 
 
 
 
 
633
 
634
  return None
635
 
@@ -638,73 +570,19 @@ Answer:"""
638
  if not text:
639
  return "Information not available in the document."
640
 
641
- # Remove unwanted patterns
642
  text = re.sub(r'\n+', ' ', text)
643
  text = re.sub(r'\s+', ' ', text)
644
- text = re.sub(r'\[.*?\]', '', text)
645
- text = re.sub(r'Based on.*?[,:]', '', text, flags=re.IGNORECASE)
646
- text = re.sub(r'According to.*?[,:]', '', text, flags=re.IGNORECASE)
647
- text = re.sub(r'Answer:\s*', '', text, flags=re.IGNORECASE)
648
-
649
- # Remove repetitive content
650
- sentences = text.split('.')
651
- unique_sentences = []
652
- seen = set()
653
-
654
- for sentence in sentences:
655
- sentence = sentence.strip()
656
- if sentence and sentence not in seen and len(sentence) > 10:
657
- seen.add(sentence)
658
- unique_sentences.append(sentence)
659
-
660
- # Take first 2 sentences max
661
- text = '. '.join(unique_sentences[:2])
662
-
663
- # Ensure proper ending
664
- if text and not text.endswith(('.', '!', '?')):
665
- text += '.'
666
-
667
- # Validate against context
668
- if not self._validate_answer_against_context(text, context):
669
- return "Information not available in the document."
670
-
671
- return text.strip()
672
-
673
- def _validate_answer_against_context(self, answer: str, context: str) -> bool:
674
- """Validate that the answer is grounded in the context"""
675
- if not answer or "not available" in answer.lower():
676
- return True
677
-
678
- answer_lower = answer.lower()
679
- context_lower = context.lower()
680
-
681
- # Extract key numbers from answer
682
- answer_numbers = re.findall(r'\d+', answer_lower)
683
 
684
- # Check if key numbers exist in context
685
- for number in answer_numbers:
686
- if number not in context_lower:
687
- return False
 
 
688
 
689
- # Check key terms overlap
690
- answer_words = set(re.findall(r'\b\w+\b', answer_lower))
691
- context_words = set(re.findall(r'\b\w+\b', context_lower))
692
-
693
- # Remove common words
694
- common_words = {'the', 'is', 'are', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
695
- 'of', 'with', 'by', 'from', 'as', 'be', 'have', 'has', 'will', 'this', 'that'}
696
-
697
- meaningful_answer_words = answer_words - common_words
698
- meaningful_context_words = context_words - common_words
699
-
700
- if not meaningful_answer_words:
701
- return True
702
-
703
- # Check overlap ratio
704
- overlap = meaningful_answer_words.intersection(meaningful_context_words)
705
- overlap_ratio = len(overlap) / len(meaningful_answer_words)
706
-
707
- return overlap_ratio >= 0.6 # At least 60% of meaningful words should be in context
708
 
709
  class EnhancedSingleDocumentSystem:
710
  """Enhanced system optimized for single document processing"""
@@ -721,14 +599,20 @@ class EnhancedSingleDocumentSystem:
721
  self.initialize_embeddings()
722
 
723
  def initialize_embeddings(self):
724
- """Initialize embedding model"""
725
  try:
726
  self.embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
727
- self.embedding_model.max_seq_length = 384
728
  logger.info("Embedding model loaded: all-MiniLM-L6-v2")
729
  except Exception as e:
730
  logger.error(f"Embedding model error: {e}")
731
- raise RuntimeError(f"Embedding model failed to load: {str(e)}")
 
 
 
 
 
 
732
 
733
  def process_document_optimized(self, url: str) -> Dict[str, Any]:
734
  """Process single document with comprehensive analysis"""
@@ -742,8 +626,12 @@ class EnhancedSingleDocumentSystem:
742
  if not response:
743
  return {'success': False, 'error': f'Failed to download document from {url}'}
744
 
 
 
745
  # Determine document type and extract
746
  content_type = response.headers.get('content-type', '').lower()
 
 
747
  if 'pdf' in content_type or url.lower().endswith('.pdf'):
748
  structured_content = self.doc_processor.extract_pdf_optimized(response.content, url)
749
  elif 'docx' in content_type or url.lower().endswith('.docx'):
@@ -759,11 +647,15 @@ class EnhancedSingleDocumentSystem:
759
  'total_words': len(text_content.split()),
760
  'source_url': url
761
  }
 
762
  except Exception as e:
763
  return {'success': False, 'error': f'Unsupported document type or encoding error: {str(e)}'}
764
 
765
- if not structured_content.get('full_text'):
766
- return {'success': False, 'error': 'No text content could be extracted from the document'}
 
 
 
767
 
768
  # Create optimized chunks
769
  self.document_chunks = self.chunker.create_smart_chunks(structured_content)
@@ -775,9 +667,10 @@ class EnhancedSingleDocumentSystem:
775
  chunk_texts = [chunk.text for chunk in self.document_chunks]
776
 
777
  try:
 
778
  self.chunk_embeddings = self.embedding_model.encode(
779
  chunk_texts,
780
- batch_size=8,
781
  show_progress_bar=False,
782
  convert_to_numpy=True,
783
  normalize_embeddings=True
@@ -788,7 +681,10 @@ class EnhancedSingleDocumentSystem:
788
  self.index = faiss.IndexFlatIP(dimension)
789
  self.index.add(self.chunk_embeddings.astype('float32'))
790
 
 
 
791
  except Exception as e:
 
792
  return {'success': False, 'error': f'Embedding creation failed: {str(e)}'}
793
 
794
  self.document_processed = True
@@ -816,6 +712,7 @@ class EnhancedSingleDocumentSystem:
816
 
817
  for attempt in range(max_retries):
818
  try:
 
819
  response = requests.get(url, headers=headers, timeout=30, stream=True)
820
  response.raise_for_status()
821
  return response
@@ -826,17 +723,20 @@ class EnhancedSingleDocumentSystem:
826
 
827
  return None
828
 
829
- def semantic_search_optimized(self, query: str, top_k: int = 10) -> List[DocumentChunk]:
830
  """Enhanced semantic search with better relevance scoring"""
831
  if not self.index or not self.document_chunks or not self.document_processed:
 
832
  return []
833
 
834
  try:
 
 
835
  # Create query embedding
836
  query_embedding = self.embedding_model.encode([query], normalize_embeddings=True)
837
 
838
- # Search for more candidates than needed
839
- search_k = min(top_k * 3, len(self.document_chunks))
840
  scores, indices = self.index.search(query_embedding.astype('float32'), search_k)
841
 
842
  # Enhanced scoring with keyword matching
@@ -845,6 +745,7 @@ class EnhancedSingleDocumentSystem:
845
 
846
  # Define query-specific keywords for boosting
847
  query_keywords = self._extract_query_keywords(query_lower)
 
848
 
849
  for score, idx in zip(scores[0], indices[0]):
850
  if 0 <= idx < len(self.document_chunks):
@@ -856,33 +757,33 @@ class EnhancedSingleDocumentSystem:
856
 
857
  # Keyword matching boost
858
  keyword_matches = sum(1 for keyword in query_keywords if keyword in chunk_text_lower)
859
- boosted_score += keyword_matches * 0.2
860
 
861
  # Importance score boost
862
  boosted_score += chunk.importance_score * 0.1
863
 
864
  # Exact phrase matching boost
865
- if len(query_keywords) >= 2:
866
- query_phrases = [' '.join(query_keywords[i:i+2]) for i in range(len(query_keywords)-1)]
867
- phrase_matches = sum(1 for phrase in query_phrases if phrase in chunk_text_lower)
868
- boosted_score += phrase_matches * 0.3
869
 
870
  # Number/percentage matching boost
871
  query_numbers = re.findall(r'\d+', query_lower)
872
  chunk_numbers = re.findall(r'\d+', chunk_text_lower)
873
  number_matches = len(set(query_numbers).intersection(set(chunk_numbers)))
874
- boosted_score += number_matches * 0.15
875
 
 
876
  boosted_results.append((boosted_score, idx, chunk))
877
 
878
  # Sort by boosted score
879
  boosted_results.sort(key=lambda x: x[0], reverse=True)
880
 
881
- # Select top results with context windows
882
  top_chunks = []
883
- for _, idx, chunk in boosted_results[:top_k]:
884
- # Add context window to chunk
885
- chunk.context_window = self._get_context_window(idx)
886
  top_chunks.append(chunk)
887
 
888
  return top_chunks
@@ -894,7 +795,7 @@ class EnhancedSingleDocumentSystem:
894
  def _extract_query_keywords(self, query_lower: str) -> List[str]:
895
  """Extract relevant keywords from query for boosting"""
896
  # Remove common question words
897
- stop_words = {'what', 'is', 'are', 'the', 'a', 'an', 'how', 'when', 'where', 'why', 'which', 'who'}
898
 
899
  words = re.findall(r'\b\w+\b', query_lower)
900
  keywords = [word for word in words if word not in stop_words and len(word) > 2]
@@ -905,35 +806,14 @@ class EnhancedSingleDocumentSystem:
905
  compound_terms.append('grace period')
906
  if 'waiting' in keywords and 'period' in keywords:
907
  compound_terms.append('waiting period')
 
 
908
  if 'sum' in keywords and 'insured' in keywords:
909
  compound_terms.append('sum insured')
910
- if 'room' in keywords and 'rent' in keywords:
911
- compound_terms.append('room rent')
912
- if 'co' in keywords and 'payment' in keywords:
913
- compound_terms.append('co-payment')
914
 
915
  return keywords + compound_terms
916
 
917
- def _get_context_window(self, chunk_idx: int, window_size: int = 1) -> str:
918
- """Get context from surrounding chunks"""
919
- context_parts = []
920
-
921
- # Add previous chunk context
922
- if chunk_idx > 0:
923
- prev_chunk = self.document_chunks[chunk_idx - 1]
924
- context_parts.append(prev_chunk.text[-200:]) # Last 200 chars
925
-
926
- # Add current chunk
927
- context_parts.append(self.document_chunks[chunk_idx].text)
928
-
929
- # Add next chunk context
930
- if chunk_idx < len(self.document_chunks) - 1:
931
- next_chunk = self.document_chunks[chunk_idx + 1]
932
- context_parts.append(next_chunk.text[:200]) # First 200 chars
933
-
934
- return " ... ".join(context_parts)
935
-
936
- def _build_optimized_context(self, question: str, chunks: List[DocumentChunk], max_length: int = 1000) -> str:
937
  """Build optimized context from top chunks"""
938
  if not chunks:
939
  return ""
@@ -941,25 +821,27 @@ class EnhancedSingleDocumentSystem:
941
  context_parts = []
942
  current_length = 0
943
 
944
- # Sort chunks by importance and relevance
945
  sorted_chunks = sorted(chunks, key=lambda x: x.importance_score, reverse=True)
946
 
947
  for chunk in sorted_chunks:
948
- chunk_text = chunk.context_window if chunk.context_window else chunk.text
949
  chunk_length = len(chunk_text)
950
 
951
  if current_length + chunk_length <= max_length:
952
  context_parts.append(chunk_text)
953
  current_length += chunk_length
954
  else:
955
- # Add partial chunk if there's space
956
  remaining_space = max_length - current_length
957
- if remaining_space > 150: # Only if meaningful space left
958
  truncated = chunk_text[:remaining_space-3] + "..."
959
  context_parts.append(truncated)
960
  break
961
 
962
- return " ".join(context_parts)
 
 
963
 
964
  def process_single_query_optimized(self, question: str) -> Dict[str, Any]:
965
  """Process single query with enhanced accuracy"""
@@ -974,10 +856,13 @@ class EnhancedSingleDocumentSystem:
974
 
975
  start_time = time.time()
976
  try:
 
 
977
  # Get relevant chunks
978
- top_chunks = self.semantic_search_optimized(question, top_k=8)
979
 
980
  if not top_chunks:
 
981
  return {
982
  'answer': 'No relevant information found in the document for this question.',
983
  'confidence': 0.0,
@@ -989,11 +874,12 @@ class EnhancedSingleDocumentSystem:
989
  # Build comprehensive context
990
  context = self._build_optimized_context(question, top_chunks)
991
 
992
- # Log for debugging
993
- logger.info(f"Question: '{question[:50]}...' | Chunks: {len(top_chunks)} | Context length: {len(context)}")
994
 
995
  # Generate answer
996
  result = self.qa_system.generate_answer(question, context, top_chunks)
 
 
997
  return result
998
 
999
  except Exception as e:
@@ -1018,7 +904,7 @@ class EnhancedSingleDocumentSystem:
1018
  }
1019
 
1020
  for i, question in enumerate(questions):
1021
- logger.info(f"Processing question {i+1}/{len(questions)}: {question[:50]}...")
1022
  result = self.process_single_query_optimized(question)
1023
  answers.append(result['answer'])
1024
 
@@ -1057,10 +943,17 @@ def process_hackathon_submission(url_text, questions_text):
1057
  if not questions:
1058
  return "No valid questions found. Please provide questions as JSON array or one per line."
1059
 
 
 
 
1060
  # Process document
1061
  doc_result = enhanced_system.process_document_optimized(url)
1062
  if not doc_result.get("success"):
1063
- return f"Document processing failed: {doc_result.get('error')}"
 
 
 
 
1064
 
1065
  # Process questions
1066
  batch_result = enhanced_system.process_batch_queries_optimized(questions)
@@ -1088,10 +981,14 @@ def process_single_question(url_text, question):
1088
  if not url:
1089
  return "No valid URL found. Please provide a document URL."
1090
 
 
 
1091
  # Process document
1092
  doc_result = enhanced_system.process_document_optimized(url)
1093
  if not doc_result.get("success"):
1094
- return f"Document processing failed: {doc_result.get('error')}"
 
 
1095
 
1096
  # Process single question
1097
  result = enhanced_system.process_single_query_optimized(question)
@@ -1124,200 +1021,105 @@ def hackathon_wrapper(url_text, questions_text):
1124
  def single_query_wrapper(url_text, question):
1125
  return process_single_question(url_text, question)
1126
 
1127
- # Simplified Gradio Interface
1128
  with gr.Blocks(
1129
  theme=gr.themes.Soft(
1130
  primary_hue="blue",
1131
  secondary_hue="indigo",
1132
  neutral_hue="slate",
1133
- font=[gr.themes.GoogleFont("Inter"), "ui-sans-serif", "system-ui", "sans-serif"],
1134
  ),
1135
- css="""
1136
- .gradio-container {
1137
- background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
1138
- min-height: 100vh;
1139
- }
1140
-
1141
- .main-content {
1142
- background: white;
1143
- border-radius: 15px;
1144
- box-shadow: 0 20px 40px rgba(0,0,0,0.1);
1145
- margin: 1rem;
1146
- overflow: hidden;
1147
- }
1148
-
1149
- .app-header {
1150
- text-align: center;
1151
- padding: 2rem;
1152
- background: linear-gradient(135deg, #4f46e5 0%, #7c3aed 100%);
1153
- color: white;
1154
- }
1155
-
1156
- .app-header h1 {
1157
- font-size: 2.5rem;
1158
- font-weight: 800;
1159
- margin-bottom: 0.5rem;
1160
- text-shadow: 2px 2px 4px rgba(0,0,0,0.3);
1161
- }
1162
-
1163
- .app-header p {
1164
- font-size: 1.1rem;
1165
- opacity: 0.9;
1166
- font-weight: 500;
1167
- }
1168
-
1169
- .content-section {
1170
- padding: 2rem;
1171
- }
1172
-
1173
- .section-title {
1174
- color: #4f46e5;
1175
- font-size: 1.4rem;
1176
- font-weight: 700;
1177
- margin-bottom: 1rem;
1178
- }
1179
-
1180
- .gr-button {
1181
- border-radius: 8px !important;
1182
- font-weight: 600 !important;
1183
- transition: all 0.3s ease !important;
1184
- }
1185
-
1186
- .gr-button:hover {
1187
- transform: translateY(-2px) !important;
1188
- }
1189
-
1190
- .gr-textbox textarea, .gr-textbox input {
1191
- border-radius: 8px !important;
1192
- border: 2px solid #e2e8f0 !important;
1193
- }
1194
-
1195
- .gr-textbox textarea:focus, .gr-textbox input:focus {
1196
- border-color: #4f46e5 !important;
1197
- }
1198
- """
1199
  ) as demo:
1200
 
1201
- with gr.Column(elem_classes="main-content"):
1202
-
1203
- gr.HTML("""
1204
- <div class="app-header">
1205
- <h1>🎯 Single Document QA System</h1>
1206
- <p>Optimized for Accurate Insurance Document Analysis</p>
1207
- </div>
1208
- """)
1209
 
 
 
 
1210
  with gr.Row():
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1211
 
1212
- with gr.Column(scale=1, elem_classes="content-section"):
1213
- with gr.Tabs():
1214
-
1215
- with gr.Tab("🚀 Hackathon Mode", id=0):
1216
- gr.HTML('<h3 class="section-title">📄 Document Analysis</h3>')
1217
-
1218
- hack_url = gr.Textbox(
1219
- label="📄 Document URL",
1220
- placeholder="https://example.com/insurance-policy.pdf",
1221
- lines=2,
1222
- info="Enter single document URL (PDF or DOCX format)"
1223
- )
1224
-
1225
- hack_questions = gr.Textbox(
1226
- label="❓ Questions",
1227
- placeholder='["What is the grace period?", "Is maternity covered?"]',
1228
- lines=6,
1229
- info="Enter questions as JSON array or one per line"
1230
- )
1231
-
1232
- with gr.Row():
1233
- hack_clear_btn = gr.Button("🗑️ Clear", variant="secondary")
1234
- hack_submit_btn = gr.Button("🚀 Process Questions", variant="primary")
1235
-
1236
- with gr.Tab("🔍 Single Query", id=1):
1237
- gr.HTML('<h3 class="section-title">🔍 Detailed Analysis</h3>')
1238
-
1239
- single_url = gr.Textbox(
1240
- label="📄 Document URL",
1241
- placeholder="https://example.com/insurance-policy.pdf",
1242
- lines=2,
1243
- info="Enter document URL for analysis"
1244
- )
1245
-
1246
- single_question = gr.Textbox(
1247
- label="❓ Your Question",
1248
- placeholder="What is the waiting period for pre-existing diseases?",
1249
- lines=3,
1250
- info="Ask a specific question about the document"
1251
- )
1252
-
1253
- with gr.Row():
1254
- single_clear_btn = gr.Button("🗑️ Clear", variant="secondary")
1255
- single_submit_btn = gr.Button("🔍 Get Answer", variant="primary")
1256
 
1257
- with gr.Column(scale=2, elem_classes="content-section"):
1258
- gr.HTML('<h3 class="section-title">📊 Results</h3>')
 
 
 
 
 
 
 
 
1259
 
1260
- with gr.Tabs():
1261
- with gr.Tab(" Hackathon Results", id=2):
1262
- hack_output = gr.Textbox(
1263
- label="📊 JSON Response",
1264
- lines=25,
1265
- interactive=False,
1266
- show_copy_button=True
1267
- )
1268
-
1269
- with gr.Tab("🔍 Detailed Results", id=3):
1270
- single_output = gr.Textbox(
1271
- label="📋 Comprehensive Response",
1272
- lines=25,
1273
- interactive=False,
1274
- show_copy_button=True
1275
- )
1276
-
1277
- # Event handlers
1278
- hack_submit_btn.click(
1279
- fn=hackathon_wrapper,
1280
- inputs=[hack_url, hack_questions],
1281
- outputs=[hack_output],
1282
- concurrency_limit=4
1283
- )
1284
-
1285
- hack_clear_btn.click(
1286
- lambda: (None, None, None),
1287
- outputs=[hack_url, hack_questions, hack_output]
1288
- )
1289
-
1290
- single_submit_btn.click(
1291
- fn=single_query_wrapper,
1292
- inputs=[single_url, single_question],
1293
- outputs=[single_output],
1294
- concurrency_limit=4
1295
- )
1296
-
1297
- single_clear_btn.click(
1298
- lambda: (None, None, None),
1299
- outputs=[single_url, single_question, single_output]
1300
- )
1301
 
1302
  # Configure for deployment
1303
- demo.queue(max_size=20)
1304
 
1305
- # Mount Gradio on FastAPI. This `app` object is what we will run.
1306
  app = gr.mount_gradio_app(api_app, demo, path="/")
1307
 
1308
- # Use this block to run the app correctly with Uvicorn
1309
  if __name__ == "__main__":
1310
- print("Starting server with Uvicorn...")
1311
-
1312
- # Read the ROOT_PATH from an environment variable.
1313
- # Default to "/" if the variable is not set (for local testing).
1314
- root_path = os.getenv("ROOT_PATH", "/")
1315
 
1316
- print(f"Using root_path: {root_path}") # Add a log to see what's being used
1317
-
 
 
1318
  uvicorn.run(
1319
  app,
1320
- host="0.0.0.0",
1321
- port=7860,
1322
- root_path=root_path # <-- Use the dynamically determined root_path
1323
  )
 
38
  ):
39
  try:
40
  data = await request.json()
41
+ documents = data.get("documents")
42
  questions = data.get("questions")
43
 
44
  if not documents or not questions:
 
49
 
50
  # Handle single document URL
51
  if isinstance(documents, list):
52
+ document_url = documents[0]
53
  else:
54
  document_url = documents
55
 
 
65
  return JSONResponse(content={"answers": answers}, status_code=200)
66
 
67
  except Exception as e:
68
+ logger.error(f"API Error: {str(e)}")
69
  return JSONResponse(content={"error": str(e)}, status_code=500)
70
 
71
  @dataclass
 
107
  page_text = page.extract_text()
108
  if page_text:
109
  cleaned_text = self._clean_text_comprehensive(page_text)
110
+ if len(cleaned_text.strip()) > 30: # Reduced minimum length
111
  pages_content.append({
112
  'page_num': page_num + 1,
113
  'text': cleaned_text,
 
126
  'source_url': source_url
127
  }
128
 
129
+ # Cache management
130
  if len(self.cache) >= self.max_cache_size:
131
  self.cache.pop(next(iter(self.cache)))
132
  self.cache[cache_key] = result
133
 
134
+ logger.info(f"PDF extracted: {len(pages_content)} pages, {len(all_text.split())} words")
135
  return result
136
 
137
  except Exception as e:
 
148
  for para in doc.paragraphs:
149
  if para.text.strip():
150
  cleaned_text = self._clean_text_comprehensive(para.text)
151
+ if len(cleaned_text.strip()) > 10: # Reduced minimum length
152
  paragraphs.append(cleaned_text)
153
  full_text += " " + cleaned_text
154
 
155
+ result = {
156
  'pages': [{'page_num': 1, 'text': full_text, 'word_count': len(full_text.split())}],
157
  'full_text': full_text.strip(),
158
  'total_pages': 1,
 
161
  'source_url': source_url
162
  }
163
 
164
+ logger.info(f"DOCX extracted: {len(paragraphs)} paragraphs, {len(full_text.split())} words")
165
+ return result
166
+
167
  except Exception as e:
168
  logger.error(f"DOCX extraction error: {e}")
169
  return {'pages': [], 'full_text': '', 'total_pages': 0, 'total_words': 0, 'source_url': source_url}
 
173
  if not text:
174
  return ""
175
 
176
+ # Basic cleaning - preserve more content
177
  text = re.sub(r'\s+', ' ', text.strip())
178
 
179
  # Fix spacing around punctuation
180
  text = re.sub(r'\s+([.,:;!?])', r'\1', text)
181
  text = re.sub(r'([.!?])\s*([A-Z])', r'\1 \2', text)
182
 
183
+ # Preserve insurance terminology - be more conservative
 
 
 
 
184
  text = re.sub(r'(\d+)\s*months?', r'\1 months', text, flags=re.IGNORECASE)
185
  text = re.sub(r'(\d+)\s*days?', r'\1 days', text, flags=re.IGNORECASE)
186
  text = re.sub(r'(\d+)\s*years?', r'\1 years', text, flags=re.IGNORECASE)
 
 
 
 
 
 
187
 
188
+ # Fix common insurance terms
189
+ text = re.sub(r'Rs\.?\s*(\d+)', r'Rs. \1', text, flags=re.IGNORECASE)
190
+ text = re.sub(r'grace\s+period', 'grace period', text, flags=re.IGNORECASE)
191
+ text = re.sub(r'waiting\s+period', 'waiting period', text, flags=re.IGNORECASE)
192
 
193
  return text.strip()
194
 
195
  class EnhancedChunker:
196
  """Enhanced chunking with better context preservation"""
197
 
198
+ def __init__(self, chunk_size: int = 300, overlap: int = 75, min_chunk_size: int = 80): # Smaller chunks for better precision
199
  self.chunk_size = chunk_size
200
  self.overlap = overlap
201
  self.min_chunk_size = min_chunk_size
 
210
  if not full_text:
211
  return chunks
212
 
213
+ logger.info(f"Creating chunks from text of length: {len(full_text)}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
214
 
215
+ # Split by sentences first for better coherence
216
+ sentences = re.split(r'(?<=[.!?])\s+', full_text)
217
+ sentences = [s.strip() for s in sentences if s.strip()]
 
 
 
218
 
219
+ logger.info(f"Split into {len(sentences)} sentences")
 
 
220
 
221
  current_chunk = ""
222
  current_words = 0
223
 
224
+ for i, sentence in enumerate(sentences):
225
  sentence_words = len(sentence.split())
226
 
227
+ # If adding this sentence would exceed chunk size and we have content
228
  if current_words + sentence_words > self.chunk_size and current_chunk:
229
  if current_words >= self.min_chunk_size:
230
+ chunk = self._create_chunk(current_chunk.strip(), chunk_id, 1, "Document")
231
  chunks.append(chunk)
232
  chunk_id += 1
233
 
234
  # Start new chunk with overlap
235
+ overlap_sentences = []
236
+ temp_words = 0
237
+ j = 0
238
+ while j < min(3, len(sentences) - i) and temp_words < self.overlap:
239
+ if i - j - 1 >= 0:
240
+ prev_sentence = sentences[i - j - 1]
241
+ sentence_len = len(prev_sentence.split())
242
+ if temp_words + sentence_len <= self.overlap:
243
+ overlap_sentences.insert(0, prev_sentence)
244
+ temp_words += sentence_len
245
+ j += 1
246
+ else:
247
+ break
248
+
249
+ current_chunk = " ".join(overlap_sentences) + " " + sentence if overlap_sentences else sentence
250
+ current_words = len(current_chunk.split())
251
  else:
252
  if current_chunk:
253
  current_chunk += " " + sentence
 
255
  current_chunk = sentence
256
  current_words += sentence_words
257
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
258
  # Add final chunk
259
  if current_chunk.strip() and current_words >= self.min_chunk_size:
260
  chunk = self._create_chunk(current_chunk.strip(), chunk_id, 1, "Document")
261
  chunks.append(chunk)
262
 
263
+ logger.info(f"Created {len(chunks)} chunks")
264
+
265
+ # If no chunks created, create one from full text
266
+ if not chunks and full_text.strip():
267
+ chunk = self._create_chunk(full_text.strip(), 0, 1, "Document")
268
  chunks.append(chunk)
269
+ logger.info("Created fallback chunk from full text")
270
 
271
  return chunks
272
 
 
288
  score = 1.0
289
  text_lower = text.lower()
290
 
291
+ # Enhanced keyword matching for insurance documents
292
+ high_value_terms = [
293
+ 'grace period', 'waiting period', 'premium payment', 'sum insured',
294
+ 'coverage amount', 'maternity', 'co-payment', 'deductible', 'exclusion',
295
+ 'benefit', 'claim', 'policy', 'thirty days', '30 days', 'months', 'years'
 
296
  ]
297
 
298
+ insurance_terms = [
299
+ 'premium', 'coverage', 'policy', 'benefit', 'exclusion', 'inclusion',
300
+ 'hospital', 'treatment', 'medical', 'health', 'cashless', 'reimbursement'
 
301
  ]
302
 
 
 
 
 
 
 
303
  # Calculate scores
304
+ high_value_count = sum(1 for term in high_value_terms if term in text_lower)
305
  insurance_count = sum(1 for term in insurance_terms if term in text_lower)
 
 
 
306
 
307
+ score += high_value_count * 0.5
308
+ score += insurance_count * 0.2
 
 
309
 
310
  # Boost for numerical information
311
  if re.search(r'\d+\s*(days?|months?|years?)', text_lower):
312
  score += 0.4
313
+ if re.search(r'grace\s*period', text_lower):
314
+ score += 0.6
315
+ if re.search(r'waiting\s*period', text_lower):
316
+ score += 0.5
317
 
318
  return min(score, 5.0)
319
 
 
327
  self.initialize_models()
328
 
329
  def initialize_models(self):
330
+ """Initialize CPU-friendly model with better error handling"""
331
+ model_name = "microsoft/DialoGPT-medium" # More reliable alternative
 
332
  try:
333
+ logger.info(f"Loading model: {model_name}")
334
  self.tokenizer = AutoTokenizer.from_pretrained(model_name)
335
 
336
+ # Add padding token if missing
337
+ if self.tokenizer.pad_token is None:
338
+ self.tokenizer.pad_token = self.tokenizer.eos_token
339
+
340
  self.model = AutoModelForCausalLM.from_pretrained(
341
  model_name,
342
  torch_dtype=torch.float32,
 
344
  low_cpu_mem_usage=True
345
  )
346
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
347
  logger.info(f"Model loaded successfully: {model_name}")
348
 
349
  except Exception as e:
350
+ logger.error(f"Failed to load primary model, using fallback: {e}")
351
+ # Fallback to pattern-based approach only
352
+ self.tokenizer = None
353
+ self.model = None
354
+ self.qa_pipeline = None
355
 
356
  def generate_answer(self, question: str, context: str, top_chunks: List[DocumentChunk]) -> Dict[str, Any]:
357
  """Generate answer with comprehensive context analysis"""
358
  start_time = time.time()
359
  try:
360
+ logger.info(f"Processing question: {question[:50]}...")
361
+ logger.info(f"Context length: {len(context)}")
362
+
363
+ # First try enhanced pattern-based extraction
364
  direct_answer = self._extract_comprehensive_answer(question, context)
365
+ if direct_answer and direct_answer != "Information not available in the document.":
366
+ logger.info(f"Pattern-based answer found: {direct_answer[:50]}...")
367
  return {
368
  'answer': direct_answer,
369
  'confidence': 0.95,
370
+ 'reasoning': "Pattern-based extraction from document content",
371
  'processing_time': time.time() - start_time,
372
  'source_chunks': len(top_chunks)
373
  }
374
 
375
+ # Enhanced fuzzy matching for common questions
376
+ fuzzy_answer = self._fuzzy_answer_extraction(question, context)
377
+ if fuzzy_answer:
378
+ logger.info(f"Fuzzy answer found: {fuzzy_answer[:50]}...")
379
+ return {
380
+ 'answer': fuzzy_answer,
381
+ 'confidence': 0.85,
382
+ 'reasoning': "Fuzzy pattern matching from document content",
383
+ 'processing_time': time.time() - start_time,
384
+ 'source_chunks': len(top_chunks)
385
+ }
 
 
 
 
 
 
 
386
 
387
+ # If no pattern match, try model generation (if available)
388
+ if self.model and self.tokenizer:
389
+ try:
390
+ # Simple prompt for better results
391
+ prompt = f"Question: {question}\nContext: {context[:500]}\nAnswer:"
392
+
393
+ inputs = self.tokenizer.encode(prompt, return_tensors='pt', max_length=512, truncation=True)
394
+
395
+ with torch.no_grad():
396
+ outputs = self.model.generate(
397
+ inputs,
398
+ max_new_tokens=30,
399
+ num_return_sequences=1,
400
+ temperature=0.7,
401
+ do_sample=True,
402
+ pad_token_id=self.tokenizer.eos_token_id
403
+ )
404
+
405
+ result = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
406
+ result = result.replace(prompt, "").strip()
407
+
408
+ if result and len(result) > 5:
409
+ result = self._clean_and_validate_answer(result, context)
410
+ if result != "Information not available in the document.":
411
+ return {
412
+ 'answer': result,
413
+ 'confidence': 0.7,
414
+ 'reasoning': "Generated from model analysis",
415
+ 'processing_time': time.time() - start_time,
416
+ 'source_chunks': len(top_chunks)
417
+ }
418
+
419
+ except Exception as e:
420
+ logger.error(f"Model generation error: {e}")
421
 
422
+ # Final fallback - context search
423
+ context_answer = self._context_search_answer(question, context)
424
+ if context_answer:
425
+ return {
426
+ 'answer': context_answer,
427
+ 'confidence': 0.6,
428
+ 'reasoning': "Context-based search result",
429
+ 'processing_time': time.time() - start_time,
430
+ 'source_chunks': len(top_chunks)
431
+ }
432
 
433
  return {
434
+ 'answer': "Information not available in the document.",
435
+ 'confidence': 0.0,
436
+ 'reasoning': "No relevant information found in document",
437
  'processing_time': time.time() - start_time,
438
  'source_chunks': len(top_chunks)
439
  }
 
449
  }
450
 
451
def _extract_comprehensive_answer(self, question: str, context: str) -> Optional[str]:
    """Extract a grace-period or waiting-period answer from the context with regexes.

    Only questions containing the phrase "grace period" or "waiting period"
    are handled; every other question yields ``None``.

    Args:
        question: The question being asked.
        context: Text to search for the answer.

    Returns:
        A one-sentence answer, or ``None`` when no pattern matches.
    """
    question_lower = question.lower()
    context_lower = context.lower()

    logger.info(f"Pattern extraction for: {question_lower}")

    # Grace period patterns
    if 'grace period' in question_lower:
        # A day count is either digits ("30 days", "(30) days") or a spelled-out
        # number optionally followed by digits in parentheses ("thirty (30) days").
        spelled_out = {'fifteen': '15', 'thirty': '30', 'sixty': '60', 'ninety': '90'}
        duration = (
            r'(?:(?P<word>fifteen|thirty|sixty|ninety)\s*(?:\(\s*\d+\s*\)\s*)?'
            r'|(?P<num>\d+)\s*\)?\s*)days?'
        )
        grace_patterns = [
            rf'grace period[^.]*?{duration}',
            rf'{duration}[^.]*?grace period',
            rf'premium.*?{duration}.*?grace',
            rf'grace[^.]*?{duration}',
            rf'{duration}.*?premium.*?payment.*?grace',
            rf'payment.*?grace.*?{duration}',
        ]

        # The day count must come from text a pattern ties to "grace"; never
        # assume 30 merely because "30" or "thirty" occurs somewhere in the
        # context (it may belong to an unrelated clause, or be part of "300").
        for pattern in grace_patterns:
            match = re.search(pattern, context_lower)
            if match:
                days = match.group('num') or spelled_out[match.group('word')]
                return f"The grace period is {days} days for premium payment."

    # Waiting period patterns
    if 'waiting period' in question_lower:
        waiting_patterns = [
            r'waiting period[^.]*?(\d+)\s*(days?|months?)',
            r'(\d+)\s*(days?|months?)[^.]*?waiting period',
            r'wait.*?(\d+)\s*(days?|months?)',
            r'(\d+)\s*(months?|days?)[^.]*?wait',
            r'coverage.*?after.*?(\d+)\s*(months?|days?)',
        ]

        for pattern in waiting_patterns:
            match = re.search(pattern, context_lower)
            if match:
                # Every pattern captures (number, unit) in that order.
                number, unit = match.group(1), match.group(2)
                return f"The waiting period is {number} {unit}."

    return None
503
+
504
+ def _fuzzy_answer_extraction(self, question: str, context: str) -> Optional[str]:
505
+ """Fuzzy matching for common insurance questions"""
506
+ question_lower = question.lower()
507
+ context_lower = context.lower()
508
+
509
+ # Grace period fuzzy matching
510
+ if any(word in question_lower for word in ['grace', 'premium payment']):
511
+ # Look for any mention of days with grace/premium
512
+ day_matches = re.findall(r'(\d+)\s*days?', context_lower)
513
+ if day_matches:
514
+ # Common insurance grace periods
515
+ for days in day_matches:
516
+ if days in ['30', 'fifteen', '15', 'thirty']:
517
+ if 'grace' in context_lower or 'premium' in context_lower:
518
+ return f"The grace period is {days} days for premium payment."
519
 
520
  # Maternity coverage
521
  if 'maternity' in question_lower:
522
+ if 'maternity' in context_lower:
523
+ if any(word in context_lower for word in ['covered', 'included', 'benefit']):
524
+ return "Yes, maternity is covered under the policy."
525
+ elif any(word in context_lower for word in ['excluded', 'not covered']):
526
+ return "No, maternity is not covered under the policy."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
527
 
528
+ return None
529
+
530
+ def _context_search_answer(self, question: str, context: str) -> Optional[str]:
531
+ """Search context for relevant sentences"""
532
+ question_lower = question.lower()
533
+ context_sentences = re.split(r'[.!?]+', context)
 
 
 
 
 
 
 
534
 
535
+ question_keywords = set(re.findall(r'\b\w+\b', question_lower))
536
+ question_keywords.discard('what')
537
+ question_keywords.discard('is')
538
+ question_keywords.discard('the')
539
+ question_keywords.discard('are')
 
 
 
 
 
 
 
 
 
540
 
541
+ best_sentence = ""
542
+ best_score = 0
543
+
544
+ for sentence in context_sentences:
545
+ if len(sentence.strip()) < 20:
546
+ continue
547
+
548
+ sentence_lower = sentence.lower()
549
+ sentence_words = set(re.findall(r'\b\w+\b', sentence_lower))
550
+
551
+ # Calculate overlap
552
+ overlap = question_keywords.intersection(sentence_words)
553
+ score = len(overlap)
554
+
555
+ # Boost for numbers and specific terms
556
+ if re.search(r'\d+', sentence_lower):
557
+ score += 2
558
+
559
+ if score > best_score and score > 1: # At least 2 overlapping words
560
+ best_score = score
561
+ best_sentence = sentence.strip()
562
+
563
+ if best_sentence and best_score >= 2:
564
+ return best_sentence + "."
565
 
566
  return None
567
 
 
570
  if not text:
571
  return "Information not available in the document."
572
 
573
+ # Clean the text
574
  text = re.sub(r'\n+', ' ', text)
575
  text = re.sub(r'\s+', ' ', text)
576
+ text = text.strip()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
577
 
578
+ # Take only first sentence if multiple
579
+ sentences = re.split(r'[.!?]+', text)
580
+ if sentences:
581
+ text = sentences[0].strip()
582
+ if text and not text.endswith(('.', '!', '?')):
583
+ text += '.'
584
 
585
+ return text if text else "Information not available in the document."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
586
 
587
  class EnhancedSingleDocumentSystem:
588
  """Enhanced system optimized for single document processing"""
 
599
  self.initialize_embeddings()
600
 
601
def initialize_embeddings(self) -> None:
    """Load the sentence-embedding model, falling back to a second model on failure.

    Sets ``self.embedding_model``. The primary model is 'all-MiniLM-L6-v2'
    with ``max_seq_length`` set to 256; if loading it raises, the error is
    logged and 'paraphrase-MiniLM-L3-v2' is tried instead.

    Raises:
        RuntimeError: If the fallback model cannot be loaded either.
    """
    try:
        self.embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
        self.embedding_model.max_seq_length = 256  # Reduced for better performance
        logger.info("Embedding model loaded: all-MiniLM-L6-v2")
    except Exception as e:
        logger.error(f"Embedding model error: {e}")
        try:
            # Fallback to a smaller model (max_seq_length is left at that
            # model's own default here)
            self.embedding_model = SentenceTransformer('paraphrase-MiniLM-L3-v2')
            logger.info("Loaded fallback embedding model")
        except Exception as e2:
            logger.error(f"Fallback embedding model also failed: {e2}")
            # Nothing to embed with, so surface the failure to the caller
            raise RuntimeError(f"No embedding model could be loaded: {str(e2)}")
616
 
617
  def process_document_optimized(self, url: str) -> Dict[str, Any]:
618
  """Process single document with comprehensive analysis"""
 
626
  if not response:
627
  return {'success': False, 'error': f'Failed to download document from {url}'}
628
 
629
+ logger.info(f"Downloaded document, size: {len(response.content)} bytes")
630
+
631
  # Determine document type and extract
632
  content_type = response.headers.get('content-type', '').lower()
633
+ logger.info(f"Content type: {content_type}")
634
+
635
  if 'pdf' in content_type or url.lower().endswith('.pdf'):
636
  structured_content = self.doc_processor.extract_pdf_optimized(response.content, url)
637
  elif 'docx' in content_type or url.lower().endswith('.docx'):
 
647
  'total_words': len(text_content.split()),
648
  'source_url': url
649
  }
650
+ logger.info("Processed as text document")
651
  except Exception as e:
652
  return {'success': False, 'error': f'Unsupported document type or encoding error: {str(e)}'}
653
 
654
+ full_text = structured_content.get('full_text', '')
655
+ logger.info(f"Extracted text length: {len(full_text)}")
656
+
657
+ if not full_text or len(full_text.strip()) < 50:
658
+ return {'success': False, 'error': 'No meaningful text content could be extracted from the document'}
659
 
660
  # Create optimized chunks
661
  self.document_chunks = self.chunker.create_smart_chunks(structured_content)
 
667
  chunk_texts = [chunk.text for chunk in self.document_chunks]
668
 
669
  try:
670
+ logger.info("Creating embeddings...")
671
  self.chunk_embeddings = self.embedding_model.encode(
672
  chunk_texts,
673
+ batch_size=4, # Reduced batch size
674
  show_progress_bar=False,
675
  convert_to_numpy=True,
676
  normalize_embeddings=True
 
681
  self.index = faiss.IndexFlatIP(dimension)
682
  self.index.add(self.chunk_embeddings.astype('float32'))
683
 
684
+ logger.info(f"Created FAISS index with {len(self.document_chunks)} chunks")
685
+
686
  except Exception as e:
687
+ logger.error(f"Embedding creation failed: {e}")
688
  return {'success': False, 'error': f'Embedding creation failed: {str(e)}'}
689
 
690
  self.document_processed = True
 
712
 
713
  for attempt in range(max_retries):
714
  try:
715
+ logger.info(f"Download attempt {attempt + 1} for {url}")
716
  response = requests.get(url, headers=headers, timeout=30, stream=True)
717
  response.raise_for_status()
718
  return response
 
723
 
724
  return None
725
 
726
+ def semantic_search_optimized(self, query: str, top_k: int = 8) -> List[DocumentChunk]:
727
  """Enhanced semantic search with better relevance scoring"""
728
  if not self.index or not self.document_chunks or not self.document_processed:
729
+ logger.warning("Document not processed or index not available")
730
  return []
731
 
732
  try:
733
+ logger.info(f"Searching for: {query}")
734
+
735
  # Create query embedding
736
  query_embedding = self.embedding_model.encode([query], normalize_embeddings=True)
737
 
738
+ # Search for candidates
739
+ search_k = min(top_k * 2, len(self.document_chunks))
740
  scores, indices = self.index.search(query_embedding.astype('float32'), search_k)
741
 
742
  # Enhanced scoring with keyword matching
 
745
 
746
  # Define query-specific keywords for boosting
747
  query_keywords = self._extract_query_keywords(query_lower)
748
+ logger.info(f"Query keywords: {query_keywords}")
749
 
750
  for score, idx in zip(scores[0], indices[0]):
751
  if 0 <= idx < len(self.document_chunks):
 
757
 
758
  # Keyword matching boost
759
  keyword_matches = sum(1 for keyword in query_keywords if keyword in chunk_text_lower)
760
+ boosted_score += keyword_matches * 0.3
761
 
762
  # Importance score boost
763
  boosted_score += chunk.importance_score * 0.1
764
 
765
  # Exact phrase matching boost
766
+ if 'grace period' in query_lower and 'grace period' in chunk_text_lower:
767
+ boosted_score += 0.5
768
+ if 'waiting period' in query_lower and 'waiting period' in chunk_text_lower:
769
+ boosted_score += 0.5
770
 
771
  # Number/percentage matching boost
772
  query_numbers = re.findall(r'\d+', query_lower)
773
  chunk_numbers = re.findall(r'\d+', chunk_text_lower)
774
  number_matches = len(set(query_numbers).intersection(set(chunk_numbers)))
775
+ boosted_score += number_matches * 0.2
776
 
777
+ logger.info(f"Chunk {idx}: base_score={score:.3f}, boosted={boosted_score:.3f}, keywords={keyword_matches}")
778
  boosted_results.append((boosted_score, idx, chunk))
779
 
780
  # Sort by boosted score
781
  boosted_results.sort(key=lambda x: x[0], reverse=True)
782
 
783
+ # Select top results
784
  top_chunks = []
785
+ for score, idx, chunk in boosted_results[:top_k]:
786
+ logger.info(f"Selected chunk {idx}: score={score:.3f}, text preview: {chunk.text[:100]}...")
 
787
  top_chunks.append(chunk)
788
 
789
  return top_chunks
 
795
  def _extract_query_keywords(self, query_lower: str) -> List[str]:
796
  """Extract relevant keywords from query for boosting"""
797
  # Remove common question words
798
+ stop_words = {'what', 'is', 'are', 'the', 'a', 'an', 'how', 'when', 'where', 'why', 'which', 'who', 'for', 'under'}
799
 
800
  words = re.findall(r'\b\w+\b', query_lower)
801
  keywords = [word for word in words if word not in stop_words and len(word) > 2]
 
806
  compound_terms.append('grace period')
807
  if 'waiting' in keywords and 'period' in keywords:
808
  compound_terms.append('waiting period')
809
+ if 'premium' in keywords and 'payment' in keywords:
810
+ compound_terms.append('premium payment')
811
  if 'sum' in keywords and 'insured' in keywords:
812
  compound_terms.append('sum insured')
 
 
 
 
813
 
814
  return keywords + compound_terms
815
 
816
+ def _build_optimized_context(self, question: str, chunks: List[DocumentChunk], max_length: int = 800) -> str:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
817
  """Build optimized context from top chunks"""
818
  if not chunks:
819
  return ""
 
821
  context_parts = []
822
  current_length = 0
823
 
824
+ # Prioritize chunks with higher importance scores
825
  sorted_chunks = sorted(chunks, key=lambda x: x.importance_score, reverse=True)
826
 
827
  for chunk in sorted_chunks:
828
+ chunk_text = chunk.text
829
  chunk_length = len(chunk_text)
830
 
831
  if current_length + chunk_length <= max_length:
832
  context_parts.append(chunk_text)
833
  current_length += chunk_length
834
  else:
835
+ # Add partial chunk if there's meaningful space left
836
  remaining_space = max_length - current_length
837
+ if remaining_space > 100:
838
  truncated = chunk_text[:remaining_space-3] + "..."
839
  context_parts.append(truncated)
840
  break
841
 
842
+ context = " ".join(context_parts)
843
+ logger.info(f"Built context of length: {len(context)}")
844
+ return context
845
 
846
  def process_single_query_optimized(self, question: str) -> Dict[str, Any]:
847
  """Process single query with enhanced accuracy"""
 
856
 
857
  start_time = time.time()
858
  try:
859
+ logger.info(f"Processing query: {question}")
860
+
861
  # Get relevant chunks
862
+ top_chunks = self.semantic_search_optimized(question, top_k=6)
863
 
864
  if not top_chunks:
865
+ logger.warning("No relevant chunks found")
866
  return {
867
  'answer': 'No relevant information found in the document for this question.',
868
  'confidence': 0.0,
 
874
  # Build comprehensive context
875
  context = self._build_optimized_context(question, top_chunks)
876
 
877
+ logger.info(f"Context preview: {context[:200]}...")
 
878
 
879
  # Generate answer
880
  result = self.qa_system.generate_answer(question, context, top_chunks)
881
+
882
+ logger.info(f"Generated answer: {result['answer']}")
883
  return result
884
 
885
  except Exception as e:
 
904
  }
905
 
906
  for i, question in enumerate(questions):
907
+ logger.info(f"Processing question {i+1}/{len(questions)}: {question}")
908
  result = self.process_single_query_optimized(question)
909
  answers.append(result['answer'])
910
 
 
943
  if not questions:
944
  return "No valid questions found. Please provide questions as JSON array or one per line."
945
 
946
+ logger.info(f"Processing URL: {url}")
947
+ logger.info(f"Processing questions: {questions}")
948
+
949
  # Process document
950
  doc_result = enhanced_system.process_document_optimized(url)
951
  if not doc_result.get("success"):
952
+ error_msg = f"Document processing failed: {doc_result.get('error')}"
953
+ logger.error(error_msg)
954
+ return error_msg
955
+
956
+ logger.info("Document processed successfully")
957
 
958
  # Process questions
959
  batch_result = enhanced_system.process_batch_queries_optimized(questions)
 
981
  if not url:
982
  return "No valid URL found. Please provide a document URL."
983
 
984
+ logger.info(f"Processing single question - URL: {url}, Question: {question}")
985
+
986
  # Process document
987
  doc_result = enhanced_system.process_document_optimized(url)
988
  if not doc_result.get("success"):
989
+ error_msg = f"Document processing failed: {doc_result.get('error')}"
990
+ logger.error(error_msg)
991
+ return error_msg
992
 
993
  # Process single question
994
  result = enhanced_system.process_single_query_optimized(question)
 
1021
def single_query_wrapper(url_text, question):
    """Gradio callback: answer one question about the document at ``url_text``."""
    response_text = process_single_question(url_text, question)
    return response_text
1023
 
1024
# Create Gradio Interface
# Two tabs share the same layout: inputs and a submit button in the left
# column, a read-only output box in the right column.
# NOTE(review): block nesting below is reconstructed from the statement order;
# confirm against the original file's indentation.
with gr.Blocks(
    theme=gr.themes.Soft(
        primary_hue="blue",
        secondary_hue="indigo",
        neutral_hue="slate",
    ),
    title="Enhanced Document QA System"
) as demo:

    # Page header
    gr.Markdown("""
    # 🎯 Enhanced Single Document QA System
    **Optimized for Accurate Insurance Document Analysis**

    This system can process PDF and DOCX documents to answer questions about their content.
    """)

    # Tab 1: one document URL plus a list of questions
    with gr.Tab("🚀 Hackathon Mode"):
        gr.Markdown("### Process multiple questions in hackathon format")

        with gr.Row():
            with gr.Column():
                hack_url = gr.Textbox(
                    label="📄 Document URL",
                    placeholder="https://example.com/insurance-policy.pdf",
                    lines=2
                )

                hack_questions = gr.Textbox(
                    label="❓ Questions (JSON format)",
                    placeholder='["What is the grace period?", "Is maternity covered?"]',
                    lines=6
                )

                hack_submit_btn = gr.Button("🚀 Process Questions", variant="primary")

            with gr.Column():
                hack_output = gr.Textbox(
                    label="📊 Results",
                    lines=20,
                    interactive=False
                )

        # Send the URL and questions to hackathon_wrapper; show its return value
        hack_submit_btn.click(
            fn=hackathon_wrapper,
            inputs=[hack_url, hack_questions],
            outputs=[hack_output]
        )

    # Tab 2: one document URL plus a single question
    with gr.Tab("🔍 Single Query"):
        gr.Markdown("### Ask detailed questions about the document")

        with gr.Row():
            with gr.Column():
                single_url = gr.Textbox(
                    label="📄 Document URL",
                    placeholder="https://example.com/insurance-policy.pdf",
                    lines=2
                )

                single_question = gr.Textbox(
                    label=" Your Question",
                    placeholder="What is the grace period for premium payment?",
                    lines=3
                )

                single_submit_btn = gr.Button("🔍 Get Answer", variant="primary")

            with gr.Column():
                single_output = gr.Textbox(
                    label="📋 Detailed Response",
                    lines=20,
                    interactive=False
                )

        # Send the URL and question to single_query_wrapper; show its return value
        single_submit_btn.click(
            fn=single_query_wrapper,
            inputs=[single_url, single_question],
            outputs=[single_output]
        )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1104
 
1105
# Configure for deployment.
# Gradio 4.x no longer accepts `concurrency_count` in `queue()` (passing it
# raises at import time); `default_concurrency_limit` is its replacement and
# caps how many calls of each event handler may run at once.
demo.queue(max_size=10, default_concurrency_limit=2)

# Mount Gradio on FastAPI; `app` is the ASGI application served below
app = gr.mount_gradio_app(api_app, demo, path="/")

# Main execution
if __name__ == "__main__":
    print("Starting Enhanced Document QA System...")
    print(f"Gradio version: {gr.__version__}")

    # Get port from environment or use default (environment values are
    # strings, so the default is a string too and int() parses both alike)
    port = int(os.getenv("PORT", "7860"))

    # Use uvicorn to run the app
    uvicorn.run(
        app,
        host="0.0.0.0",
        port=port,
        log_level="info"
    )
requirements.txt CHANGED
@@ -1,12 +1,12 @@
1
  gradio==4.44.0
2
- fastapi
3
- uvicorn
4
- transformers>=4.38.0
5
- sentence-transformers
6
- faiss-cpu
7
- numpy
8
- requests
9
- pypdf2
10
- python-docx
11
- torch==2.3.1
12
- uvicorn
 
1
  gradio==4.44.0
2
+ transformers==4.36.0
3
+ torch==2.1.0
4
+ faiss-cpu==1.7.4
5
+ numpy==1.24.3
6
+ sentence-transformers==2.2.2
7
+ PyPDF2==3.0.1
8
+ python-docx==0.8.11
9
+ requests==2.31.0
10
+ fastapi==0.104.1
11
+ uvicorn==0.24.0
12
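+ # logging is part of the Python standard library; the PyPI package named "logging" is an obsolete Python 2 distribution and must not be installed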