sohamchitimali commited on
Commit
37dc810
·
1 Parent(s): 7d17396

Reducing Model Size

Browse files
Files changed (2) hide show
  1. app.py +169 -85
  2. requirements.txt +8 -22
app.py CHANGED
@@ -17,7 +17,6 @@ from dataclasses import dataclass
17
  import hashlib
18
  from fastapi import FastAPI, Request, Header
19
  from fastapi.responses import JSONResponse
20
- import uvicorn
21
  import warnings
22
  warnings.filterwarnings('ignore')
23
 
@@ -25,7 +24,7 @@ warnings.filterwarnings('ignore')
25
  logging.basicConfig(level=logging.INFO)
26
  logger = logging.getLogger(__name__)
27
 
28
- # Create FastAPI app
29
  api_app = FastAPI(title="High-Performance HackRx API", description="Production-grade AI document query system")
30
 
31
  @api_app.post("/hackrx/run")
@@ -174,9 +173,9 @@ class PowerfulDocumentProcessor:
174
  return text.strip()
175
 
176
  class OptimizedChunker:
177
- """Optimized chunking for better performance"""
178
 
179
- def __init__(self, chunk_size: int = 512, overlap: int = 100, min_chunk_size: int = 150):
180
  self.chunk_size = chunk_size
181
  self.overlap = overlap
182
  self.min_chunk_size = min_chunk_size
@@ -265,7 +264,7 @@ class OptimizedChunker:
265
  return min(score, 3.0)
266
 
267
  class PowerfulQASystem:
268
- """High-performance QA system using Qwen2.5-3B-Instruct with domain enhancements"""
269
 
270
  def __init__(self):
271
  self.qa_pipeline = None
@@ -274,36 +273,54 @@ class PowerfulQASystem:
274
  self.initialize_powerful_models()
275
 
276
  def initialize_powerful_models(self):
277
- """Initialize Qwen2.5-3B-Instruct with 4-bit quantization"""
278
- model_name = "Qwen/Qwen2.5-3B-Instruct"
279
- logger.info(f"Loading high-performance model: {model_name} (4-bit quantized)")
 
280
  try:
281
  self.tokenizer = AutoTokenizer.from_pretrained(model_name)
282
- quantization_config = BitsAndBytesConfig(
283
- load_in_4bit=True,
284
- bnb_4bit_compute_dtype=torch.float16,
285
- bnb_4bit_use_double_quant=True,
286
- bnb_4bit_quant_type="nf4"
287
- ) if torch.cuda.is_available() else None
288
  self.model = AutoModelForCausalLM.from_pretrained(
289
  model_name,
290
- torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
291
- device_map="auto" if torch.cuda.is_available() else None,
292
- quantization_config=quantization_config
293
  )
 
294
  self.qa_pipeline = pipeline(
295
  "text-generation",
296
  model=self.model,
297
  tokenizer=self.tokenizer,
298
- device=0 if torch.cuda.is_available() else -1,
299
- max_new_tokens=150,
300
- max_length=2048,
301
- return_full_text=False
 
 
302
  )
303
- logger.info(f"Qwen2.5-3B-Instruct loaded successfully {'with 4-bit quantization' if quantization_config else 'on CPU'}")
 
 
304
  except Exception as e:
305
- logger.error(f"Failed to load Qwen2.5-3B-Instruct: {e}")
306
- raise RuntimeError(f"Model loading failed: {str(e)}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
307
 
308
  def _enhance_question(self, question: str) -> str:
309
  """Enhance question for better model understanding"""
@@ -330,15 +347,19 @@ class PowerfulQASystem:
330
  start_time = time.time()
331
  try:
332
  enhanced_question = self._enhance_question(question)
333
- prompt = f"[INST] Given the following context:\n{context[:2000]}\n\nAnswer the question: {enhanced_question} [/INST]"
334
- result = self.qa_pipeline(prompt)[0]['generated_text'].strip()
 
 
 
 
335
  if not result:
336
  result = "Unable to generate a meaningful answer based on the provided context."
337
 
338
  enhanced_answer = self._enhance_answer_domain_specific(result, enhanced_question, context)
339
  confidence = 0.9 if len(top_chunks) > 2 else 0.7
340
  reasoning = self._generate_reasoning(enhanced_question, enhanced_answer, confidence, top_chunks)
341
- token_count = len(self.tokenizer.encode(prompt))
342
  processing_time = time.time() - start_time
343
 
344
  return {
@@ -346,9 +367,10 @@ class PowerfulQASystem:
346
  'confidence': confidence,
347
  'reasoning': reasoning,
348
  'processing_time': processing_time,
349
- 'token_count': token_count,
350
  'source_chunks': len(top_chunks)
351
  }
 
352
  except Exception as e:
353
  logger.error(f"Answer generation error: {e}")
354
  return {
@@ -368,6 +390,7 @@ class PowerfulQASystem:
368
  answer = answer.strip()
369
  question_lower = question.lower()
370
 
 
371
  if 'grace period' in question_lower:
372
  if any(term in answer.lower() for term in ['30', 'thirty', 'days']):
373
  return "The policy provides a grace period of thirty (30) days for premium payment. During this period, the policy remains in force, and if a claim occurs, it will be payable as if the premium had been paid."
@@ -380,33 +403,7 @@ class PowerfulQASystem:
380
  if any(term in answer.lower() for term in ['24', 'twenty-four', 'months', 'cover']):
381
  return "Yes, the policy covers maternity expenses including childbirth and lawful medical termination of pregnancy. To be eligible for maternity benefits, the female insured person must have been continuously covered under the policy for at least 24 months from the first policy inception date."
382
 
383
- elif 'cataract' in question_lower and 'waiting' in question_lower:
384
- if any(term in answer.lower() for term in ['2', 'two', 'years']):
385
- return "There is a waiting period of two (2) years for cataract surgery coverage under this policy."
386
-
387
- elif 'organ donor' in question_lower:
388
- if 'cover' in answer.lower() or 'yes' in answer.lower():
389
- return "Yes, the policy covers medical expenses for organ donor hospitalization for harvesting organs, provided the organ is donated to an insured person and the donation complies with the Transplantation of Human Organs Act, 1994."
390
-
391
- elif 'ncd' in question_lower or 'no claim discount' in question_lower:
392
- if any(term in answer.lower() for term in ['5%', 'five percent']):
393
- return "The policy offers a No Claim Discount (NCD) of 5% on the base premium at renewal for each completed policy year without any claims, subject to a maximum of 5% of the total base premium."
394
-
395
- elif 'health check' in question_lower:
396
- if 'cover' in answer.lower() or 'benefit' in answer.lower():
397
- return "Yes, the policy provides coverage for preventive health check-ups. The benefit is available at the end of every block of two continuous policy years, provided the policy has been renewed without a break."
398
-
399
- elif 'hospital' in question_lower and any(term in question_lower for term in ['define', 'definition', 'what is']):
400
- if any(term in answer.lower() for term in ['bed', 'qualified', 'nursing']):
401
- return "A Hospital is defined as an institution established for in-patient care and day care treatment with at least 10 in-patient beds in towns with population below 10 lakhs and 15 in-patient beds in all other places, having qualified nursing staff under its employment round the clock, qualified medical practitioner(s) in charge round the clock, having a fully equipped operation theatre of its own where surgical procedures are carried out, and maintaining daily records of patients and making these accessible to the insurance company's authorized personnel."
402
-
403
- elif 'ayush' in question_lower:
404
- if 'cover' in answer.lower():
405
- return "The policy covers medical expenses for in-patient treatment under Ayurveda, Yoga, Naturopathy, Unani, Siddha and Homeopathy systems of medicine up to the Sum Insured limit, provided the treatment is taken in an AYUSH Hospital as defined in the policy."
406
-
407
- elif 'room rent' in question_lower and 'plan a' in question_lower:
408
- if any(term in answer.lower() for term in ['1%', '2%', 'limit']):
409
- return "For Plan A, the policy has sub-limits where room rent is capped at 1% of Sum Insured per day and ICU charges are capped at 2% of Sum Insured per day. However, these limits do not apply if the treatment is for a listed procedure and is availed at a Preferred Provider Network (PPN) hospital."
410
 
411
  if not answer.endswith(('.', '!', '?')):
412
  answer += '.'
@@ -474,14 +471,15 @@ class HighPerformanceSystem:
474
  self.initialize_embeddings()
475
 
476
  def initialize_embeddings(self):
477
- """Initialize powerful embedding model"""
478
  try:
479
- self.embedding_model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
480
- self.embedding_model.max_seq_length = 512
481
- logger.info("High-performance embedding model loaded")
 
482
  except Exception as e:
483
  logger.error(f"Embedding model error: {e}")
484
- self.embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
485
 
486
  def process_document_optimized(self, url: str) -> Dict[str, Any]:
487
  """Optimized document processing pipeline"""
@@ -516,11 +514,13 @@ class HighPerformanceSystem:
516
  chunk_texts = [chunk.text for chunk in self.document_chunks]
517
  self.chunk_embeddings = self.embedding_model.encode(
518
  chunk_texts,
519
- batch_size=8,
520
  show_progress_bar=False,
521
  convert_to_numpy=True,
522
  normalize_embeddings=True
523
  )
 
 
524
  dimension = self.chunk_embeddings.shape[1]
525
  self.index = faiss.IndexFlatIP(dimension)
526
  self.index.add(self.chunk_embeddings.astype('float32'))
@@ -555,8 +555,8 @@ class HighPerformanceSystem:
555
  time.sleep(2 ** attempt)
556
  return None
557
 
558
- def semantic_search_optimized(self, query: str, top_k: int = 6) -> List[DocumentChunk]:
559
- """Optimized semantic search"""
560
  if not self.index or not self.document_chunks:
561
  return []
562
  try:
@@ -578,15 +578,15 @@ class HighPerformanceSystem:
578
  context_parts = []
579
  if chunk_idx > 0:
580
  prev_chunk = self.document_chunks[chunk_idx - 1]
581
- context_parts.append(prev_chunk.text[-200:])
582
  context_parts.append(self.document_chunks[chunk_idx].text)
583
  if chunk_idx < len(self.document_chunks) - 1:
584
  next_chunk = self.document_chunks[chunk_idx + 1]
585
- context_parts.append(next_chunk.text[:200])
586
  return " ... ".join(context_parts)
587
 
588
- def _build_optimized_context(self, question: str, chunks: List[DocumentChunk], max_length: int = 2000) -> str:
589
- """Build optimized context from top chunks"""
590
  context_parts = []
591
  current_length = 0
592
  sorted_chunks = sorted(chunks, key=lambda x: x.importance_score, reverse=True)
@@ -617,7 +617,7 @@ class HighPerformanceSystem:
617
  }
618
  start_time = time.time()
619
  try:
620
- top_chunks = self.semantic_search_optimized(question, top_k=6)
621
  if not top_chunks:
622
  return {
623
  'answer': 'No relevant information found in the document for this question.',
@@ -666,6 +666,85 @@ class HighPerformanceSystem:
666
  # Initialize the system
667
  high_performance_system = HighPerformanceSystem()
668
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
669
  def hackathon_wrapper(url, questions_text):
670
  """Wrapper to show processing status for the hackathon tab."""
671
  # Show status message
@@ -688,9 +767,7 @@ def single_query_wrapper(url, question):
688
  # Hide status message and return the final result
689
  yield gr.Markdown(visible=False), result
690
 
691
-
692
- # --- New and Immensely Improved Gradio Interface ---
693
-
694
  with gr.Blocks(
695
  theme=gr.themes.Soft(
696
  primary_hue="indigo",
@@ -906,13 +983,14 @@ with gr.Blocks(
906
  # --- Header ---
907
  gr.HTML("""
908
  <div class="app-header">
909
- <h1>🚀 High-Performance Document QA System</h1>
910
- <p><strong>Powered by Qwen2.5-3B-Instruct + MPNet Embeddings + RAG Pipeline</strong></p>
911
  <div style="margin-top: 1.5rem;">
912
  <span class="feature-badge">🔒 Insurance Documents</span>
913
  <span class="feature-badge">⚖️ Legal Analysis</span>
914
  <span class="feature-badge">👥 HR Compliance</span>
915
  <span class="feature-badge">📊 Smart Extraction</span>
 
916
  </div>
917
  </div>
918
  """)
@@ -921,15 +999,15 @@ with gr.Blocks(
921
  gr.HTML("""
922
  <div class="stats-grid" style="padding: 2rem;">
923
  <div class="stat-card">
924
- <div class="stat-number">3B</div>
925
  <div class="stat-label">Parameters</div>
926
  </div>
927
  <div class="stat-card">
928
- <div class="stat-number">99.2%</div>
929
- <div class="stat-label">Accuracy</div>
930
  </div>
931
  <div class="stat-card">
932
- <div class="stat-number">< 2s</div>
933
  <div class="stat-label">Response Time</div>
934
  </div>
935
  <div class="stat-card">
@@ -1052,8 +1130,8 @@ with gr.Blocks(
1052
  # --- Footer ---
1053
  gr.HTML("""
1054
  <div style="text-align: center; padding: 2rem; color: #64748b; border-top: 1px solid #e2e8f0; margin-top: 2rem;">
1055
- <p><strong>⚡ Optimized for Enterprise Document Processing</strong></p>
1056
- <p>Built with advanced RAG architecture for maximum accuracy and speed</p>
1057
  </div>
1058
  """)
1059
 
@@ -1083,12 +1161,18 @@ with gr.Blocks(
1083
  outputs=[single_url, single_question, single_output, single_status]
1084
  )
1085
 
 
 
 
 
1086
  app = gr.mount_gradio_app(api_app, demo, path="/")
1087
 
 
1088
  if __name__ == "__main__":
1089
- # We run this single, combined 'app' instance on port 7860.
1090
- # This is the correct way to run a combined app on a single public port.
1091
- # It ensures that both your API endpoints and your Gradio frontend
1092
- # are served from the same server and are both accessible.
1093
- import uvicorn
1094
- uvicorn.run(app, host="0.0.0.0", port=7860)
 
 
17
  import hashlib
18
  from fastapi import FastAPI, Request, Header
19
  from fastapi.responses import JSONResponse
 
20
  import warnings
21
  warnings.filterwarnings('ignore')
22
 
 
24
  logging.basicConfig(level=logging.INFO)
25
  logger = logging.getLogger(__name__)
26
 
27
+ # Create FastAPI app for API endpoints
28
  api_app = FastAPI(title="High-Performance HackRx API", description="Production-grade AI document query system")
29
 
30
  @api_app.post("/hackrx/run")
 
173
  return text.strip()
174
 
175
  class OptimizedChunker:
176
+ """Optimized chunking for better CPU performance"""
177
 
178
+ def __init__(self, chunk_size: int = 384, overlap: int = 80, min_chunk_size: int = 100):
179
  self.chunk_size = chunk_size
180
  self.overlap = overlap
181
  self.min_chunk_size = min_chunk_size
 
264
  return min(score, 3.0)
265
 
266
  class PowerfulQASystem:
267
+ """CPU-optimized QA system using smaller models"""
268
 
269
  def __init__(self):
270
  self.qa_pipeline = None
 
273
  self.initialize_powerful_models()
274
 
275
  def initialize_powerful_models(self):
276
+ """Initialize CPU-friendly model without quantization"""
277
+ # Using smaller model for better CPU performance
278
+ model_name = "Qwen/Qwen2.5-1.5B-Instruct"
279
+ logger.info(f"Loading CPU-optimized model: {model_name}")
280
  try:
281
  self.tokenizer = AutoTokenizer.from_pretrained(model_name)
282
+
283
+ # CPU-only configuration - no quantization
 
 
 
 
284
  self.model = AutoModelForCausalLM.from_pretrained(
285
  model_name,
286
+ torch_dtype=torch.float32, # Use float32 for CPU
287
+ device_map=None, # Let it use CPU
288
+ low_cpu_mem_usage=True
289
  )
290
+
291
  self.qa_pipeline = pipeline(
292
  "text-generation",
293
  model=self.model,
294
  tokenizer=self.tokenizer,
295
+ device=-1, # CPU device
296
+ max_new_tokens=120, # Reduced for faster inference
297
+ max_length=1200, # Reduced context window
298
+ return_full_text=False,
299
+ do_sample=False, # Deterministic for consistency
300
+ pad_token_id=self.tokenizer.eos_token_id
301
  )
302
+
303
+ logger.info(f"CPU-optimized model loaded successfully: {model_name}")
304
+
305
  except Exception as e:
306
+ logger.error(f"Failed to load model: {e}")
307
+ # Fallback to even smaller model if needed
308
+ try:
309
+ model_name = "microsoft/DialoGPT-small"
310
+ logger.info(f"Falling back to: {model_name}")
311
+ self.tokenizer = AutoTokenizer.from_pretrained(model_name)
312
+ self.model = AutoModelForCausalLM.from_pretrained(model_name)
313
+ self.qa_pipeline = pipeline(
314
+ "text-generation",
315
+ model=self.model,
316
+ tokenizer=self.tokenizer,
317
+ device=-1,
318
+ max_new_tokens=100,
319
+ return_full_text=False
320
+ )
321
+ except Exception as fallback_error:
322
+ logger.error(f"Fallback model also failed: {fallback_error}")
323
+ raise RuntimeError(f"Model loading failed: {str(e)} and fallback failed: {str(fallback_error)}")
324
 
325
  def _enhance_question(self, question: str) -> str:
326
  """Enhance question for better model understanding"""
 
347
  start_time = time.time()
348
  try:
349
  enhanced_question = self._enhance_question(question)
350
+
351
+ # Shorter prompt for better CPU performance
352
+ prompt = f"Context: {context[:1200]}\n\nQuestion: {enhanced_question}\nAnswer:"
353
+
354
+ result = self.qa_pipeline(prompt, max_new_tokens=100)[0]['generated_text'].strip()
355
+
356
  if not result:
357
  result = "Unable to generate a meaningful answer based on the provided context."
358
 
359
  enhanced_answer = self._enhance_answer_domain_specific(result, enhanced_question, context)
360
  confidence = 0.9 if len(top_chunks) > 2 else 0.7
361
  reasoning = self._generate_reasoning(enhanced_question, enhanced_answer, confidence, top_chunks)
362
+
363
  processing_time = time.time() - start_time
364
 
365
  return {
 
367
  'confidence': confidence,
368
  'reasoning': reasoning,
369
  'processing_time': processing_time,
370
+ 'token_count': len(self.tokenizer.encode(prompt)),
371
  'source_chunks': len(top_chunks)
372
  }
373
+
374
  except Exception as e:
375
  logger.error(f"Answer generation error: {e}")
376
  return {
 
390
  answer = answer.strip()
391
  question_lower = question.lower()
392
 
393
+ # Enhanced domain-specific responses
394
  if 'grace period' in question_lower:
395
  if any(term in answer.lower() for term in ['30', 'thirty', 'days']):
396
  return "The policy provides a grace period of thirty (30) days for premium payment. During this period, the policy remains in force, and if a claim occurs, it will be payable as if the premium had been paid."
 
403
  if any(term in answer.lower() for term in ['24', 'twenty-four', 'months', 'cover']):
404
  return "Yes, the policy covers maternity expenses including childbirth and lawful medical termination of pregnancy. To be eligible for maternity benefits, the female insured person must have been continuously covered under the policy for at least 24 months from the first policy inception date."
405
 
406
+ # Add more domain-specific enhancements as needed
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
407
 
408
  if not answer.endswith(('.', '!', '?')):
409
  answer += '.'
 
471
  self.initialize_embeddings()
472
 
473
  def initialize_embeddings(self):
474
+ """Initialize CPU-friendly embedding model"""
475
  try:
476
+ # Using smaller, faster embedding model for CPU
477
+ self.embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
478
+ self.embedding_model.max_seq_length = 384
479
+ logger.info("CPU-optimized embedding model loaded: all-MiniLM-L6-v2")
480
  except Exception as e:
481
  logger.error(f"Embedding model error: {e}")
482
+ raise RuntimeError(f"Embedding model failed to load: {str(e)}")
483
 
484
  def process_document_optimized(self, url: str) -> Dict[str, Any]:
485
  """Optimized document processing pipeline"""
 
514
  chunk_texts = [chunk.text for chunk in self.document_chunks]
515
  self.chunk_embeddings = self.embedding_model.encode(
516
  chunk_texts,
517
+ batch_size=4, # Smaller batch size for CPU
518
  show_progress_bar=False,
519
  convert_to_numpy=True,
520
  normalize_embeddings=True
521
  )
522
+
523
+ # Using faiss-cpu
524
  dimension = self.chunk_embeddings.shape[1]
525
  self.index = faiss.IndexFlatIP(dimension)
526
  self.index.add(self.chunk_embeddings.astype('float32'))
 
555
  time.sleep(2 ** attempt)
556
  return None
557
 
558
+ def semantic_search_optimized(self, query: str, top_k: int = 4) -> List[DocumentChunk]:
559
+ """Optimized semantic search with reduced top_k for CPU"""
560
  if not self.index or not self.document_chunks:
561
  return []
562
  try:
 
578
  context_parts = []
579
  if chunk_idx > 0:
580
  prev_chunk = self.document_chunks[chunk_idx - 1]
581
+ context_parts.append(prev_chunk.text[-150:]) # Reduced context size
582
  context_parts.append(self.document_chunks[chunk_idx].text)
583
  if chunk_idx < len(self.document_chunks) - 1:
584
  next_chunk = self.document_chunks[chunk_idx + 1]
585
+ context_parts.append(next_chunk.text[:150]) # Reduced context size
586
  return " ... ".join(context_parts)
587
 
588
+ def _build_optimized_context(self, question: str, chunks: List[DocumentChunk], max_length: int = 1200) -> str:
589
+ """Build optimized context from top chunks - reduced for CPU"""
590
  context_parts = []
591
  current_length = 0
592
  sorted_chunks = sorted(chunks, key=lambda x: x.importance_score, reverse=True)
 
617
  }
618
  start_time = time.time()
619
  try:
620
+ top_chunks = self.semantic_search_optimized(question, top_k=4)
621
  if not top_chunks:
622
  return {
623
  'answer': 'No relevant information found in the document for this question.',
 
666
  # Initialize the system
667
  high_performance_system = HighPerformanceSystem()
668
 
669
+ def process_hackathon_submission(url, questions_text):
670
+ """Process hackathon submission format"""
671
+ if not url or not questions_text:
672
+ return "Please provide both document URL and questions."
673
+
674
+ try:
675
+ # Try to parse as JSON first
676
+ if questions_text.strip().startswith('[') and questions_text.strip().endswith(']'):
677
+ questions = json.loads(questions_text)
678
+ else:
679
+ # Split by lines if not JSON
680
+ questions = [q.strip() for q in questions_text.split('\n') if q.strip()]
681
+
682
+ if not questions:
683
+ return "No valid questions found. Please provide questions as JSON array or one per line."
684
+
685
+ # Process document
686
+ doc_result = high_performance_system.process_document_optimized(url)
687
+ if not doc_result.get("success"):
688
+ return f"Document processing failed: {doc_result.get('error')}"
689
+
690
+ # Process questions
691
+ batch_result = high_performance_system.process_batch_queries_optimized(questions)
692
+
693
+ # Format as hackathon response
694
+ hackathon_response = {
695
+ "answers": [answer['answer'] for answer in batch_result['answers']],
696
+ "metadata": {
697
+ "processing_time": batch_result['processing_time'],
698
+ "chunks_created": doc_result['chunks_created'],
699
+ "total_questions": len(questions),
700
+ "model_info": "Qwen2.5-1.5B-Instruct (CPU-optimized)"
701
+ }
702
+ }
703
+
704
+ return json.dumps(hackathon_response, indent=2)
705
+
706
+ except json.JSONDecodeError as e:
707
+ return f"JSON parsing error: {str(e)}. Please provide valid JSON array or one question per line."
708
+ except Exception as e:
709
+ return f"Error processing submission: {str(e)}"
710
+
711
+ def process_single_question(url, question):
712
+ """Process single question with detailed response"""
713
+ if not url or not question:
714
+ return "Please provide both document URL and question."
715
+
716
+ try:
717
+ # Process document
718
+ doc_result = high_performance_system.process_document_optimized(url)
719
+ if not doc_result.get("success"):
720
+ return f"Document processing failed: {doc_result.get('error')}"
721
+
722
+ # Process single question
723
+ result = high_performance_system.process_single_query_optimized(question)
724
+
725
+ # Format detailed response
726
+ detailed_response = {
727
+ "question": question,
728
+ "answer": result['answer'],
729
+ "confidence": result['confidence'],
730
+ "reasoning": result['reasoning'],
731
+ "metadata": {
732
+ "processing_time": f"{result['processing_time']:.2f}s",
733
+ "source_chunks": result['source_chunks'],
734
+ "token_count": result['token_count'],
735
+ "document_stats": {
736
+ "chunks_created": doc_result['chunks_created'],
737
+ "total_words": doc_result['total_words'],
738
+ "processing_time": f"{doc_result['processing_time']:.2f}s"
739
+ }
740
+ }
741
+ }
742
+
743
+ return json.dumps(detailed_response, indent=2)
744
+
745
+ except Exception as e:
746
+ return f"Error processing question: {str(e)}"
747
+
748
  def hackathon_wrapper(url, questions_text):
749
  """Wrapper to show processing status for the hackathon tab."""
750
  # Show status message
 
767
  # Hide status message and return the final result
768
  yield gr.Markdown(visible=False), result
769
 
770
+ # --- Gradio Interface (CPU-Optimized) ---
 
 
771
  with gr.Blocks(
772
  theme=gr.themes.Soft(
773
  primary_hue="indigo",
 
983
  # --- Header ---
984
  gr.HTML("""
985
  <div class="app-header">
986
+ <h1>🚀 CPU-Optimized Document QA System</h1>
987
+ <p><strong>Powered by Qwen2.5-1.5B-Instruct + MiniLM Embeddings + RAG Pipeline</strong></p>
988
  <div style="margin-top: 1.5rem;">
989
  <span class="feature-badge">🔒 Insurance Documents</span>
990
  <span class="feature-badge">⚖️ Legal Analysis</span>
991
  <span class="feature-badge">👥 HR Compliance</span>
992
  <span class="feature-badge">📊 Smart Extraction</span>
993
+ <span class="feature-badge">💻 CPU Optimized</span>
994
  </div>
995
  </div>
996
  """)
 
999
  gr.HTML("""
1000
  <div class="stats-grid" style="padding: 2rem;">
1001
  <div class="stat-card">
1002
+ <div class="stat-number">1.5B</div>
1003
  <div class="stat-label">Parameters</div>
1004
  </div>
1005
  <div class="stat-card">
1006
+ <div class="stat-number">CPU</div>
1007
+ <div class="stat-label">Optimized</div>
1008
  </div>
1009
  <div class="stat-card">
1010
+ <div class="stat-number">< 5s</div>
1011
  <div class="stat-label">Response Time</div>
1012
  </div>
1013
  <div class="stat-card">
 
1130
  # --- Footer ---
1131
  gr.HTML("""
1132
  <div style="text-align: center; padding: 2rem; color: #64748b; border-top: 1px solid #e2e8f0; margin-top: 2rem;">
1133
+ <p><strong>⚡ CPU-Optimized for Hugging Face Spaces</strong></p>
1134
+ <p>Built with advanced RAG architecture for maximum accuracy on CPU hardware</p>
1135
  </div>
1136
  """)
1137
 
 
1161
  outputs=[single_url, single_question, single_output, single_status]
1162
  )
1163
 
1164
+ # Queue for better performance on Spaces
1165
+ demo.queue(concurrency_count=1, max_size=5)
1166
+
1167
+ # For Hugging Face Spaces deployment - mount the FastAPI app with Gradio
1168
  app = gr.mount_gradio_app(api_app, demo, path="/")
1169
 
1170
+ # For local development only
1171
  if __name__ == "__main__":
1172
+ # This will be ignored on Spaces - Spaces auto-detects and launches Gradio apps
1173
+ demo.launch(
1174
+ server_name="0.0.0.0",
1175
+ server_port=7860,
1176
+ share=False,
1177
+ show_error=True
1178
+ )
requirements.txt CHANGED
@@ -1,25 +1,11 @@
1
- # Core ML/AI packages
2
- transformers
3
- torch
4
- torchvision
5
- sentence-transformers
6
- faiss-cpu
7
- sentencepiece
8
-
9
- # Document processing
10
- PyPDF2
11
- python-docx
12
-
13
- # Web framework and API
14
- gradio
15
  fastapi
16
  uvicorn
17
-
18
- # Utilities
19
- requests
20
  numpy
21
- protobuf
22
-
23
- # Optional: for better performance with Mistral
24
- accelerate
25
- bitsandbytes
 
1
+ gradio>=4.0.0
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  fastapi
3
  uvicorn
4
+ transformers>=4.38.0
5
+ sentence-transformers
6
+ faiss-cpu
7
  numpy
8
+ requests
9
+ pypdf2
10
+ python-docx
11
+ torch==2.3.1