Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -581,7 +581,7 @@ class EmbeddingFilter:
|
|
| 581 |
return search_results
|
| 582 |
|
| 583 |
class LLMSummarizer:
|
| 584 |
-
"""
|
| 585 |
|
| 586 |
def __init__(self, groq_api_key: str = "", openrouter_api_key: str = ""):
|
| 587 |
self.groq_api_key = groq_api_key
|
|
@@ -591,48 +591,132 @@ class LLMSummarizer:
|
|
| 591 |
|
| 592 |
def create_system_prompt(self) -> str:
|
| 593 |
"""Create system prompt for summarization"""
|
| 594 |
-
return """You are an expert
|
| 595 |
|
| 596 |
-
|
| 597 |
-
1.
|
| 598 |
-
2.
|
| 599 |
-
3.
|
| 600 |
-
4.
|
| 601 |
-
5.
|
| 602 |
-
6.
|
| 603 |
-
7.
|
|
|
|
|
|
|
|
|
|
| 604 |
|
| 605 |
Format your response as a comprehensive summary, not bullet points."""
|
| 606 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 607 |
async def summarize_with_groq(self, query: str, search_results: List[SearchResult],
|
| 608 |
temperature: float = 0.3, max_tokens: int = 2000) -> str:
|
| 609 |
-
"""
|
| 610 |
if not self.groq_api_key:
|
| 611 |
return "Groq API key not provided"
|
| 612 |
|
| 613 |
try:
|
| 614 |
-
# Prepare
|
| 615 |
-
|
| 616 |
-
"user_query": query,
|
| 617 |
-
"search_results": []
|
| 618 |
-
}
|
| 619 |
|
| 620 |
-
|
| 621 |
-
|
| 622 |
-
|
| 623 |
-
|
| 624 |
-
"snippet": result.snippet,
|
| 625 |
-
"content": result.content[:2000], # Limit content length
|
| 626 |
-
"publication_date": result.publication_date,
|
| 627 |
-
"relevance_score": result.relevance_score
|
| 628 |
-
})
|
| 629 |
|
| 630 |
-
user_prompt = f"""Please
|
| 631 |
|
| 632 |
-
|
| 633 |
-
{json.dumps(content_json, indent=2)}
|
| 634 |
|
| 635 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 636 |
|
| 637 |
headers = {
|
| 638 |
"Authorization": f"Bearer {self.groq_api_key}",
|
|
@@ -646,7 +730,8 @@ Provide a comprehensive summary that directly answers the user's query based on
|
|
| 646 |
{"role": "user", "content": user_prompt}
|
| 647 |
],
|
| 648 |
"temperature": temperature,
|
| 649 |
-
"max_tokens": max_tokens
|
|
|
|
| 650 |
}
|
| 651 |
|
| 652 |
async with aiohttp.ClientSession() as session:
|
|
@@ -654,43 +739,44 @@ Provide a comprehensive summary that directly answers the user's query based on
|
|
| 654 |
headers=headers, json=payload) as response:
|
| 655 |
if response.status == 200:
|
| 656 |
result = await response.json()
|
| 657 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 658 |
else:
|
| 659 |
error_text = await response.text()
|
| 660 |
return f"Groq API error: {response.status} - {error_text}"
|
| 661 |
|
| 662 |
except Exception as e:
|
| 663 |
return f"Error with Groq summarization: {str(e)}"
|
| 664 |
-
|
| 665 |
async def summarize_with_openrouter(self, query: str, search_results: List[SearchResult],
|
| 666 |
temperature: float = 0.3, max_tokens: int = 2000) -> str:
|
| 667 |
-
"""
|
| 668 |
if not self.openrouter_api_key:
|
| 669 |
return "OpenRouter API key not provided"
|
| 670 |
|
| 671 |
try:
|
| 672 |
-
# Prepare
|
| 673 |
-
|
| 674 |
-
"user_query": query,
|
| 675 |
-
"search_results": []
|
| 676 |
-
}
|
| 677 |
|
| 678 |
-
|
| 679 |
-
|
| 680 |
-
|
| 681 |
-
|
| 682 |
-
"snippet": result.snippet,
|
| 683 |
-
"content": result.content[:2000], # Limit content length
|
| 684 |
-
"publication_date": result.publication_date,
|
| 685 |
-
"relevance_score": result.relevance_score
|
| 686 |
-
})
|
| 687 |
|
| 688 |
-
user_prompt = f"""Please
|
| 689 |
|
| 690 |
-
|
| 691 |
-
{json.dumps(content_json, indent=2)}
|
| 692 |
|
| 693 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 694 |
|
| 695 |
headers = {
|
| 696 |
"Authorization": f"Bearer {self.openrouter_api_key}",
|
|
@@ -714,7 +800,12 @@ Provide a comprehensive summary that directly answers the user's query based on
|
|
| 714 |
headers=headers, json=payload) as response:
|
| 715 |
if response.status == 200:
|
| 716 |
result = await response.json()
|
| 717 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 718 |
else:
|
| 719 |
error_text = await response.text()
|
| 720 |
return f"OpenRouter API error: {response.status} - {error_text}"
|
|
@@ -797,6 +888,13 @@ class AISearchEngine:
|
|
| 797 |
results_with_content = [r for r in scraped_results if r.content.strip() and len(r.content.strip()) > 100]
|
| 798 |
status_updates.append(f"Successfully scraped {len(results_with_content)} articles with meaningful content")
|
| 799 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 800 |
# If we don't have enough content, try to get some from snippets
|
| 801 |
if len(results_with_content) < 3:
|
| 802 |
status_updates.append("Using search snippets as fallback content...")
|
|
|
|
| 581 |
return search_results
|
| 582 |
|
| 583 |
class LLMSummarizer:
|
| 584 |
+
"""Improved summarizer with better content preparation and validation"""
|
| 585 |
|
| 586 |
def __init__(self, groq_api_key: str = "", openrouter_api_key: str = ""):
|
| 587 |
self.groq_api_key = groq_api_key
|
|
|
|
| 591 |
|
| 592 |
def create_system_prompt(self) -> str:
    """Build the system prompt used for summarization.

    The prompt is a fixed text: a role statement, eight numbered
    instructions, the exact fallback sentence to use when the search
    results are not relevant, and a closing formatting requirement.

    Returns:
        The prompt as a single newline-separated string.
    """
    # One entry per line of the prompt; empty strings are the blank
    # separator lines between sections.
    prompt_lines = (
        "You are an expert research assistant. Your task is to analyze search results and provide a comprehensive, accurate summary that directly answers the user's query.",
        "",
        "CRITICAL INSTRUCTIONS:",
        "1. ONLY use information that is directly relevant to the user's query",
        "2. If the search results don't contain relevant information, explicitly state this",
        "3. Don't make up information or provide generic advice",
        "4. Synthesize information from multiple sources when available",
        "5. Include specific facts, dates, numbers, and quotes when present",
        "6. If information is contradictory between sources, mention this",
        "7. Cite sources by mentioning the publication or website name",
        "8. Be specific and detailed rather than vague",
        "",
        'If the search results are not relevant to the query, respond with: "The search results do not contain sufficient relevant information to answer your query about [topic]. The results primarily contained [brief description of what was actually found]."',
        "",
        "Format your response as a comprehensive summary, not bullet points.",
    )
    return "\n".join(prompt_lines)
|
| 609 |
+
|
| 610 |
+
def validate_content_quality(self, search_results: List[SearchResult], query: str) -> Tuple[List[SearchResult], str]:
    """Filter search results down to those worth sending to the LLM.

    A result is kept when its stripped content is at least 100 characters
    long and at least one query keyword occurs (case-insensitive substring
    match) in its content, title or snippet. Each keyword scores 2 for a
    content hit, otherwise 1 for a title hit, otherwise 0.5 for a snippet
    hit; any total above zero keeps the result.

    Args:
        search_results: Results to check; each must expose ``content``,
            ``title`` and ``snippet`` attributes.
        query: The user's search query.

    Returns:
        A tuple ``(valid_results, validation_summary)`` where
        ``validation_summary`` has one line per input result explaining
        why it was kept or skipped.
    """
    # Strip punctuation glued to the edges of a word ("python?", '"rust"')
    # so it can still match; inner punctuation ("node.js", "c++") is kept.
    edge_punctuation = ".,;:!?\"'()[]{}"
    tokens = {word.strip(edge_punctuation) for word in query.lower().split()}
    tokens.discard("")  # an empty keyword would match every text

    # Prefer words longer than two characters to skip filler such as "is"
    # or "of". If the query has only short tokens ("AI", "5G"), use them
    # anyway; otherwise every result would be rejected as irrelevant.
    keywords = {token for token in tokens if len(token) > 2} or tokens

    valid_results = []
    validation_info = []

    for result in search_results:
        if not result.content or len(result.content.strip()) < 100:
            validation_info.append(f"Skipped '{result.title}' - insufficient content")
            continue

        content_lower = result.content.lower()
        # Title and snippet may be missing; treat them as empty text.
        title_lower = (result.title or "").lower()
        snippet_lower = (result.snippet or "").lower()

        relevant_score = 0
        for keyword in keywords:
            if keyword in content_lower:
                relevant_score += 2
            elif keyword in title_lower:
                relevant_score += 1
            elif keyword in snippet_lower:
                relevant_score += 0.5

        if relevant_score > 0:
            valid_results.append(result)
            validation_info.append(f"✓ '{result.title}' - relevance score: {relevant_score}")
        else:
            validation_info.append(f"Skipped '{result.title}' - not relevant to query")

    validation_summary = "\n".join(validation_info)
    return valid_results, validation_summary
|
| 646 |
+
|
| 647 |
+
def prepare_content_for_llm(self, query: str, search_results: List[SearchResult], *, max_content_chars: int = 3000) -> str:
    """Render the query and the relevant search results as prompt text.

    Results are first filtered with ``self.validate_content_quality``. If
    none survive, the returned text contains the query, the validation
    summary and a request to report that nothing relevant was found.
    Otherwise each surviving result becomes a ``=== SOURCE n ===`` section
    with its title, URL, optional date, relevance score and snippet, and
    its (possibly truncated) content.

    Args:
        query: The user's search query.
        search_results: Candidate results; each must expose ``title``,
            ``url``, ``publication_date``, ``relevance_score``,
            ``snippet`` and ``content`` attributes.
        max_content_chars: Approximate per-source content limit. Longer
            content is cut at the first sentence end found within 200
            characters either side of this limit, or exactly at the limit
            when no sentence end is found there.

    Returns:
        The prepared text, with sections separated by newlines.
    """
    valid_results, validation_info = self.validate_content_quality(search_results, query)

    if not valid_results:
        return (
            f'Query: "{query}"\n'
            "\n"
            "VALIDATION RESULTS:\n"
            f"{validation_info}\n"
            "\n"
            "No search results contained relevant content for this query. "
            "Please provide a response indicating that insufficient relevant information was found."
        )

    content_parts = [
        f'User Query: "{query}"\n',
        f"Number of relevant sources found: {len(valid_results)}\n",
    ]

    for source_number, result in enumerate(valid_results, 1):
        # Work on the stripped text throughout so leading whitespace from
        # scraping cannot defeat the snippet de-duplication below.
        text = result.content.strip()

        content_parts.append(f"=== SOURCE {source_number} ===")
        content_parts.append(f"Title: {result.title}")
        content_parts.append(f"URL: {result.url}")

        if result.publication_date:
            content_parts.append(f"Date: {result.publication_date}")

        # A missing (None) score is treated the same as zero.
        if (result.relevance_score or 0) > 0:
            content_parts.append(f"Relevance Score: {result.relevance_score:.3f}")

        # Only show the snippet when the content does not already open with it.
        if result.snippet and not text.startswith(result.snippet[:50]):
            content_parts.append(f"Snippet: {result.snippet}")

        if len(text) > max_content_chars:
            # Prefer to cut just after the first sentence end near the limit.
            window_start = max(0, max_content_chars - 200)
            window_end = min(len(text), max_content_chars + 200)
            cut = next(
                (pos + 1 for pos in range(window_start, window_end) if text[pos] in ".!?"),
                max_content_chars,
            )
            text = text[:cut] + "... [content truncated]"

        content_parts.append(f"Content: {text}")
        content_parts.append("")  # Empty line between sources

    return "\n".join(content_parts)
|
| 694 |
+
|
| 695 |
async def summarize_with_groq(self, query: str, search_results: List[SearchResult],
|
| 696 |
temperature: float = 0.3, max_tokens: int = 2000) -> str:
|
| 697 |
+
"""Improved Groq summarization with better content preparation"""
|
| 698 |
if not self.groq_api_key:
|
| 699 |
return "Groq API key not provided"
|
| 700 |
|
| 701 |
try:
|
| 702 |
+
# Prepare well-structured content
|
| 703 |
+
prepared_content = self.prepare_content_for_llm(query, search_results)
|
|
|
|
|
|
|
|
|
|
| 704 |
|
| 705 |
+
# Debug output
|
| 706 |
+
print(f"DEBUG - Sending {len(prepared_content)} characters to Groq AI")
|
| 707 |
+
print(f"DEBUG - Results with content: {len([r for r in search_results if r.content])}")
|
| 708 |
+
print(f"DEBUG - First 300 chars: {prepared_content[:300]}...")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 709 |
|
| 710 |
+
user_prompt = f"""Please analyze the following search results and provide a comprehensive summary that directly answers the user's query.
|
| 711 |
|
| 712 |
+
{prepared_content}
|
|
|
|
| 713 |
|
| 714 |
+
Instructions:
|
| 715 |
+
- Focus ONLY on information relevant to the query: "{query}"
|
| 716 |
+
- If the results don't contain relevant information, explicitly state this
|
| 717 |
+
- Be specific and factual, include dates/numbers when available
|
| 718 |
+
- Mention source publications when referencing information
|
| 719 |
+
- Don't provide generic advice if specific information isn't found"""
|
| 720 |
|
| 721 |
headers = {
|
| 722 |
"Authorization": f"Bearer {self.groq_api_key}",
|
|
|
|
| 730 |
{"role": "user", "content": user_prompt}
|
| 731 |
],
|
| 732 |
"temperature": temperature,
|
| 733 |
+
"max_tokens": max_tokens,
|
| 734 |
+
"stream": False
|
| 735 |
}
|
| 736 |
|
| 737 |
async with aiohttp.ClientSession() as session:
|
|
|
|
| 739 |
headers=headers, json=payload) as response:
|
| 740 |
if response.status == 200:
|
| 741 |
result = await response.json()
|
| 742 |
+
summary = result["choices"][0]["message"]["content"]
|
| 743 |
+
|
| 744 |
+
# Add debug info in development
|
| 745 |
+
debug_info = f"\n\n[DEBUG - Content Sources: {len([r for r in search_results if r.content])} with content, {len(search_results)} total]"
|
| 746 |
+
return summary + debug_info
|
| 747 |
+
|
| 748 |
else:
|
| 749 |
error_text = await response.text()
|
| 750 |
return f"Groq API error: {response.status} - {error_text}"
|
| 751 |
|
| 752 |
except Exception as e:
|
| 753 |
return f"Error with Groq summarization: {str(e)}"
|
| 754 |
+
|
| 755 |
async def summarize_with_openrouter(self, query: str, search_results: List[SearchResult],
|
| 756 |
temperature: float = 0.3, max_tokens: int = 2000) -> str:
|
| 757 |
+
"""Improved OpenRouter summarization with better content preparation"""
|
| 758 |
if not self.openrouter_api_key:
|
| 759 |
return "OpenRouter API key not provided"
|
| 760 |
|
| 761 |
try:
|
| 762 |
+
# Prepare well-structured content
|
| 763 |
+
prepared_content = self.prepare_content_for_llm(query, search_results)
|
|
|
|
|
|
|
|
|
|
| 764 |
|
| 765 |
+
# Debug output
|
| 766 |
+
print(f"DEBUG - Sending {len(prepared_content)} characters to OpenRouter AI")
|
| 767 |
+
print(f"DEBUG - Results with content: {len([r for r in search_results if r.content])}")
|
| 768 |
+
print(f"DEBUG - First 300 chars: {prepared_content[:300]}...")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 769 |
|
| 770 |
+
user_prompt = f"""Please analyze the following search results and provide a comprehensive summary that directly answers the user's query.
|
| 771 |
|
| 772 |
+
{prepared_content}
|
|
|
|
| 773 |
|
| 774 |
+
Instructions:
|
| 775 |
+
- Focus ONLY on information relevant to the query: "{query}"
|
| 776 |
+
- If the results don't contain relevant information, explicitly state this
|
| 777 |
+
- Be specific and factual, include dates/numbers when available
|
| 778 |
+
- Mention source publications when referencing information
|
| 779 |
+
- Don't provide generic advice if specific information isn't found"""
|
| 780 |
|
| 781 |
headers = {
|
| 782 |
"Authorization": f"Bearer {self.openrouter_api_key}",
|
|
|
|
| 800 |
headers=headers, json=payload) as response:
|
| 801 |
if response.status == 200:
|
| 802 |
result = await response.json()
|
| 803 |
+
summary = result["choices"][0]["message"]["content"]
|
| 804 |
+
|
| 805 |
+
# Add debug info in development
|
| 806 |
+
debug_info = f"\n\n[DEBUG - Content Sources: {len([r for r in search_results if r.content])} with content, {len(search_results)} total]"
|
| 807 |
+
return summary + debug_info
|
| 808 |
+
|
| 809 |
else:
|
| 810 |
error_text = await response.text()
|
| 811 |
return f"OpenRouter API error: {response.status} - {error_text}"
|
|
|
|
| 888 |
results_with_content = [r for r in scraped_results if r.content.strip() and len(r.content.strip()) > 100]
|
| 889 |
status_updates.append(f"Successfully scraped {len(results_with_content)} articles with meaningful content")
|
| 890 |
|
| 891 |
+
# Debug: Show what content we actually got
|
| 892 |
+
for i, result in enumerate(results_with_content[:3]):
|
| 893 |
+
print(f"Result {i+1}: {result.title}")
|
| 894 |
+
print(f"Content length: {len(result.content)}")
|
| 895 |
+
print(f"Content preview: {result.content[:200]}...")
|
| 896 |
+
print("---")
|
| 897 |
+
|
| 898 |
# If we don't have enough content, try to get some from snippets
|
| 899 |
if len(results_with_content) < 3:
|
| 900 |
status_updates.append("Using search snippets as fallback content...")
|