Spaces:
Sleeping
Sleeping
Update src/rag_system.py
Browse files- src/rag_system.py +227 -1
src/rag_system.py
CHANGED
|
@@ -387,4 +387,230 @@ Summary (2-3 sentences maximum):"""
|
|
| 387 |
|
| 388 |
def get_visual_summaries_log(self) -> List[Dict]:
    """Return the accumulated visual-analysis log entries."""
    return self.visual_summaries_log
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 387 |
|
| 388 |
def get_visual_summaries_log(self) -> List[Dict]:
    """Expose the full list of visual analysis log records."""
    return self.visual_summaries_log
|
| 391 |
+
|
| 392 |
+
|
| 393 |
+
class AnsweringRAG:
    """
    RAG answer-generation stage.

    1. Receives search results from a vector store (list of dicts expected
       to carry 'content', 'type' and 'distance' keys — TODO confirm against
       the vector-store caller).
    2. Analyzes those results in the context of the user's question.
    3. Generates an answer with the LLM and tags it with a confidence level.

    Every generated result is appended to ``self.answer_log``.
    """

    def __init__(self, api_key: str = None, debug: bool = True):
        """
        Args:
            api_key: OpenAI API key; falls back to the module-level
                OPENAI_API_KEY when omitted.
            debug: when True, prints initialization and debug traces.
        """
        api_key = api_key or OPENAI_API_KEY
        self.debug = debug

        self.llm = ChatOpenAI(
            model_name="gpt-4o",  # Use gpt-4o for better understanding
            api_key=api_key,
            temperature=TEMPERATURE,
            max_tokens=MAX_TOKENS,
        )

        self.language = LANGUAGE   # language the final answer is written in
        self.answer_log = []       # every analyze_and_answer() result, in order

        if self.debug:
            print("✅ AnsweringRAG initialized with answer generation")

    def _debug_print(self, label: str, data: object):
        """Print a labeled debug trace; no-op unless self.debug is set.

        ``data`` was previously annotated ``any`` — that is the builtin
        function, not a type; ``object`` is the correct "anything" hint.
        """
        if self.debug:
            print(f"\n🔍 DEBUG [{label}]:")
            if isinstance(data, (list, dict)):
                print(f" Type: {type(data).__name__}")
                print(f" Content: {str(data)[:300]}...")  # truncate big payloads
            else:
                print(f" {data}")

    @staticmethod
    def _relevance(source: Dict) -> float:
        """Convert a vector-store distance into a relevance score in [0, 1].

        A present distance of 0 is a perfect match and now yields relevance
        1.0; a missing distance yields 0. (The previous truthiness check
        ``1 - distance if distance else 0`` wrongly reported 0% relevance
        for distance == 0. This helper also removes the duplicated logic
        that existed in analyze_and_answer and get_answer_with_sources.)
        """
        distance = source.get('distance')
        return 1 - distance if distance is not None else 0

    def analyze_and_answer(
        self,
        question: str,
        search_results: List[Dict]
    ) -> Dict:
        """
        Analyze search results and generate an answer via the LLM.

        Args:
            question: the user's question.
            search_results: vector-store hits; each dict is expected to
                carry 'content', 'type' and 'distance' keys.

        Returns:
            {
                'question': user question,
                'answer': detailed answer,
                'sources_used': number of sources,
                'confidence': low/medium/high,
                'search_results': original search results,
                ['error': str]  # only present when the LLM call failed
            }
        """

        print(f"\n{'='*70}")
        print("ANALYZING QUESTION & GENERATING ANSWER")
        print(f"{'='*70}")

        print(f"\n❓ Question: {question}")
        print(f"📊 Search Results Found: {len(search_results)}")

        # No hits at all: return a canned "not found" answer immediately
        # (no LLM call) with zero confidence.
        if not search_results:
            print("⚠️ No search results found!")
            answer = f"""I could not find relevant information in the document to answer your question: "{question}"

Try:
- Using different keywords
- Breaking the question into smaller parts
- Asking about other topics in the document"""

            result = {
                'question': question,
                'answer': answer,
                'sources_used': 0,
                'confidence': 'low',
                'search_results': []
            }
            self.answer_log.append(result)
            return result

        # Build the LLM context: one labeled section per search hit.
        context_parts = []
        for idx, hit in enumerate(search_results, 1):
            content = hit.get('content', '')
            content_type = hit.get('type', 'unknown')
            relevance = self._relevance(hit)

            context_parts.append(f"""
[Source {idx} - {content_type.upper()} (relevance: {relevance:.1%})]
{content}""")

        full_context = "\n".join(context_parts)

        self._debug_print("Context Prepared", f"{len(context_parts)} sources, {len(full_context)} chars")

        # Prompt asks the model to synthesize an answer in self.language,
        # grounded only in the retrieved context.
        analysis_prompt = f"""You are a helpful assistant analyzing document content to answer user questions.

USER QUESTION:
"{question}"

RELEVANT CONTENT FROM DOCUMENT:
{full_context}

INSTRUCTIONS:
1. Analyze the provided content carefully
2. Extract information relevant to the question
3. Synthesize a clear, comprehensive answer in {self.language}
4. If the content doesn't fully answer the question, explain what information is available
5. Be specific and cite the content when relevant
6. Structure your answer clearly with key points

ANSWER:"""

        print("\n🔍 Analyzing search results...")
        print(f" Context size: {len(full_context)} characters")
        print(f" Sources: {len(search_results)}")

        try:
            # Call the LLM to analyze the context and answer the question.
            message = HumanMessage(content=analysis_prompt)
            response = self.llm.invoke([message])
            answer = response.content.strip()

            # Heuristic confidence from source count + answer length.
            confidence = self._estimate_confidence(len(search_results), answer)

            print("✅ Answer generated successfully")
            print(f" Confidence: {confidence}")
            print(f" Answer length: {len(answer)} characters")

            result = {
                'question': question,
                'answer': answer,
                'sources_used': len(search_results),
                'confidence': confidence,
                'search_results': search_results
            }

            self.answer_log.append(result)
            return result

        except Exception as e:
            # Deliberate best-effort: log the failure, return a low-confidence
            # apology with the error attached rather than propagating.
            print(f"❌ Error generating answer: {e}")
            answer = "I encountered an error while analyzing the search results. Please try again."

            result = {
                'question': question,
                'answer': answer,
                'sources_used': len(search_results),
                'confidence': 'low',
                'error': str(e),
                'search_results': search_results
            }

            self.answer_log.append(result)
            return result

    def _estimate_confidence(self, sources_count: int, answer: str) -> str:
        """Estimate confidence of an answer from source count and length.

        Thresholds are heuristic: >=3 sources and >500 chars → high;
        >=2 sources and >200 chars → medium; otherwise low.
        """
        answer_length = len(answer)

        if sources_count >= 3 and answer_length > 500:
            return "high"
        elif sources_count >= 2 and answer_length > 200:
            return "medium"
        else:
            return "low"

    def get_answer_with_sources(
        self,
        question: str,
        search_results: List[Dict]
    ) -> Dict:
        """
        Generate an answer AND attach display-ready source citations.

        Returns the analyze_and_answer() result dict with an extra
        'formatted_sources' key: a list of
        {'index', 'type', 'content', 'relevance'} dicts.
        """

        result = self.analyze_and_answer(question, search_results)

        # Format each hit for display; relevance uses the shared helper so
        # a distance of 0 correctly maps to 100% relevance.
        result['formatted_sources'] = [
            {
                'index': idx,
                'type': source.get('type', 'unknown'),
                'content': source.get('content', ''),
                'relevance': self._relevance(source),
            }
            for idx, source in enumerate(result['search_results'], 1)
        ]
        return result

    def get_answer_log(self) -> List[Dict]:
        """Get all answer generation logs"""
        return self.answer_log

    def print_answer_with_sources(self, result: Dict, max_source_length: int = 300):
        """Pretty-print an answer dict produced by get_answer_with_sources().

        Args:
            result: dict with 'question', 'answer', 'confidence' and
                optionally 'formatted_sources'.
            max_source_length: truncation length for each source excerpt.
        """

        print(f"\n{'='*70}")
        print(f"ANSWER TO: {result['question']}")
        print(f"{'='*70}")

        print(f"\n📝 ANSWER (Confidence: {result['confidence'].upper()}):")
        print(f"{'-'*70}")
        print(result['answer'])
        print(f"{'-'*70}")

        if result.get('formatted_sources'):
            print(f"\n📚 SOURCES USED ({len(result['formatted_sources'])} total):")
            for source in result['formatted_sources']:
                print(f"\n[Source {source['index']} - {source['type'].upper()} ({source['relevance']:.0%} relevant)]")
                print(f"{source['content'][:max_source_length]}...")

        print(f"\n{'='*70}")