dnj0 commited on
Commit
77f58e2
·
verified ·
1 Parent(s): 54040b2

Update src/rag_system.py

Browse files
Files changed (1) hide show
  1. src/rag_system.py +227 -1
src/rag_system.py CHANGED
@@ -387,4 +387,230 @@ Summary (2-3 sentences maximum):"""
387
 
388
  def get_visual_summaries_log(self) -> List[Dict]:
389
  """Get all visual analysis logs"""
390
- return self.visual_summaries_log
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
387
 
388
    def get_visual_summaries_log(self) -> List[Dict]:
        """Get all visual analysis logs.

        Returns the live ``self.visual_summaries_log`` list (not a copy),
        so callers mutating the returned list mutate the instance's log.
        Entries are dicts appended elsewhere in this class — presumably
        one per visual analysis; verify against the producer code.
        """
        return self.visual_summaries_log
391
+
392
+
393
class AnsweringRAG:
    """
    RAG answer-generation layer.

    Workflow:
        1. Receives pre-computed vector-store search results.
        2. Builds a context prompt from those results.
        3. Calls the LLM to synthesize an answer, plus a rough confidence
           estimate derived from source count and answer length.

    Every result dict produced is appended to ``self.answer_log``.
    """

    def __init__(self, api_key: str = None, debug: bool = True):
        """
        Args:
            api_key: OpenAI API key; falls back to the module-level
                ``OPENAI_API_KEY`` when omitted.
            debug: When True, print verbose diagnostic output.
        """
        api_key = api_key or OPENAI_API_KEY
        self.debug = debug

        # gpt-4o chosen for better understanding of long, mixed contexts.
        self.llm = ChatOpenAI(
            model_name="gpt-4o",
            api_key=api_key,
            temperature=TEMPERATURE,
            max_tokens=MAX_TOKENS,
        )

        self.language = LANGUAGE  # target language for generated answers
        self.answer_log = []      # history of every result dict produced

        if self.debug:
            print("✅ AnsweringRAG initialized with answer generation")

    def _debug_print(self, label: str, data: object) -> None:
        """Print a labelled debug message; no-op unless ``self.debug``.

        NOTE: annotation fixed from ``any`` (the builtin function) to
        ``object``.
        """
        if self.debug:
            print(f"\n🔍 DEBUG [{label}]:")
            if isinstance(data, (list, dict)):
                print(f" Type: {type(data).__name__}")
                print(f" Content: {str(data)[:300]}...")
            else:
                print(f" {data}")

    @staticmethod
    def _relevance(hit: Dict) -> float:
        """Convert a vector distance into a relevance score.

        A distance of 0 is an exact match and must score 1.0; a missing
        distance scores 0.0.  (The previous ``if distance`` truthiness
        check wrongly scored exact matches — distance == 0 — as 0.)
        """
        distance = hit.get('distance')
        return 1 - distance if distance is not None else 0.0

    def _no_results_answer(self, question: str) -> Dict:
        """Build, log, and return the fallback result for an empty search."""
        print(f"⚠️ No search results found!")
        answer = f"""I could not find relevant information in the document to answer your question: "{question}"

Try:
- Using different keywords
- Breaking the question into smaller parts
- Asking about other topics in the document"""

        result = {
            'question': question,
            'answer': answer,
            'sources_used': 0,
            'confidence': 'low',
            'search_results': [],
        }
        self.answer_log.append(result)
        return result

    def _build_context(self, search_results: List[Dict]) -> str:
        """Format search hits into a numbered, relevance-tagged context string."""
        context_parts = []
        for idx, hit in enumerate(search_results, 1):
            content = hit.get('content', '')
            content_type = hit.get('type', 'unknown')
            relevance = self._relevance(hit)
            context_parts.append(f"""
[Source {idx} - {content_type.upper()} (relevance: {relevance:.1%})]
{content}""")
        return "\n".join(context_parts)

    def analyze_and_answer(
        self,
        question: str,
        search_results: List[Dict]
    ) -> Dict:
        """
        Analyze search results and generate an intelligent answer.

        Args:
            question: The user's question.
            search_results: Vector-store hits; each dict may carry
                'content', 'type', and 'distance' keys.

        Returns:
            {
                'question': user question,
                'answer': detailed answer,
                'sources_used': number of sources,
                'confidence': low/medium/high,
                'search_results': original search results,
                # plus 'error': str when the LLM call failed
            }
        """
        print(f"\n{'='*70}")
        print("ANALYZING QUESTION & GENERATING ANSWER")
        print(f"{'='*70}")

        print(f"\n❓ Question: {question}")
        print(f"📊 Search Results Found: {len(search_results)}")

        # Empty search → canned guidance answer, no LLM call.
        if not search_results:
            return self._no_results_answer(question)

        full_context = self._build_context(search_results)

        self._debug_print(
            "Context Prepared",
            f"{len(search_results)} sources, {len(full_context)} chars",
        )

        # Prompt asks the model to analyze the hits and answer in the
        # configured target language.
        analysis_prompt = f"""You are a helpful assistant analyzing document content to answer user questions.

USER QUESTION:
"{question}"

RELEVANT CONTENT FROM DOCUMENT:
{full_context}

INSTRUCTIONS:
1. Analyze the provided content carefully
2. Extract information relevant to the question
3. Synthesize a clear, comprehensive answer in {self.language}
4. If the content doesn't fully answer the question, explain what information is available
5. Be specific and cite the content when relevant
6. Structure your answer clearly with key points

ANSWER:"""

        print(f"\n🔍 Analyzing search results...")
        print(f" Context size: {len(full_context)} characters")
        print(f" Sources: {len(search_results)}")

        try:
            # Call LLM to analyze and answer.
            message = HumanMessage(content=analysis_prompt)
            response = self.llm.invoke([message])
            answer = response.content.strip()

            confidence = self._estimate_confidence(len(search_results), answer)

            print(f"✅ Answer generated successfully")
            print(f" Confidence: {confidence}")
            print(f" Answer length: {len(answer)} characters")

            result = {
                'question': question,
                'answer': answer,
                'sources_used': len(search_results),
                'confidence': confidence,
                'search_results': search_results,
            }
        except Exception as e:
            # Keep the API total: return a low-confidence error result
            # rather than propagating the exception to the caller.
            print(f"❌ Error generating answer: {e}")
            result = {
                'question': question,
                'answer': "I encountered an error while analyzing the search results. Please try again.",
                'sources_used': len(search_results),
                'confidence': 'low',
                'error': str(e),
                'search_results': search_results,
            }

        self.answer_log.append(result)
        return result

    def _estimate_confidence(self, sources_count: int, answer: str) -> str:
        """Heuristic confidence: more sources + longer answer → higher.

        Thresholds are arbitrary heuristics, not calibrated probabilities.
        """
        answer_length = len(answer)

        # High confidence: multiple sources, substantial answer.
        if sources_count >= 3 and answer_length > 500:
            return "high"
        # Medium confidence: some sources, decent answer.
        elif sources_count >= 2 and answer_length > 200:
            return "medium"
        # Low confidence: few sources or short answer.
        else:
            return "low"

    def get_answer_with_sources(
        self,
        question: str,
        search_results: List[Dict]
    ) -> Dict:
        """
        Get answer AND properly formatted sources.

        Returns the ``analyze_and_answer`` result dict augmented with a
        'formatted_sources' list of {'index', 'type', 'content',
        'relevance'} entries for display.
        """
        result = self.analyze_and_answer(question, search_results)

        result['formatted_sources'] = [
            {
                'index': idx,
                'type': source.get('type', 'unknown'),
                'content': source.get('content', ''),
                # Shares the distance→relevance fix (exact match = 1.0).
                'relevance': self._relevance(source),
            }
            for idx, source in enumerate(result['search_results'], 1)
        ]
        return result

    def get_answer_log(self) -> List[Dict]:
        """Get all answer generation logs (the live list, not a copy)."""
        return self.answer_log

    def print_answer_with_sources(self, result: Dict, max_source_length: int = 300):
        """Pretty print an answer result, then its sources truncated to
        ``max_source_length`` characters each."""
        print(f"\n{'='*70}")
        print(f"ANSWER TO: {result['question']}")
        print(f"{'='*70}")

        print(f"\n📝 ANSWER (Confidence: {result['confidence'].upper()}):")
        print(f"{'-'*70}")
        print(result['answer'])
        print(f"{'-'*70}")

        if result.get('formatted_sources'):
            print(f"\n📚 SOURCES USED ({len(result['formatted_sources'])} total):")
            for source in result['formatted_sources']:
                print(f"\n[Source {source['index']} - {source['type'].upper()} ({source['relevance']:.0%} relevant)]")
                print(f"{source['content'][:max_source_length]}...")

        print(f"\n{'='*70}")