Update app.py
app.py CHANGED
@@ -399,12 +399,10 @@ class ContentScraper:
except:
continue

- #
+ # Don't limit content length here - let LLM handle full content
if content:
# Remove excessive whitespace
content = ' '.join(content.split())
- # Limit length
- content = content[:3000]

return content, pub_date

@@ -424,7 +422,7 @@ class ContentScraper:
article.parse()

if article.text and len(article.text.strip()) > 100:
- content = article.text.strip()
+ content = article.text.strip() # Don't limit content length
pub_date = article.publish_date.isoformat() if article.publish_date else None
return content, pub_date

@@ -581,7 +579,7 @@ class EmbeddingFilter:
return search_results

class LLMSummarizer:
- """Improved summarizer
+ """Improved summarizer without content validation filtering - sends all scraped content to LLM"""

def __init__(self, groq_api_key: str = "", openrouter_api_key: str = ""):
self.groq_api_key = groq_api_key

@@ -594,118 +592,30 @@ class LLMSummarizer:
return """You are an expert research assistant. Your task is to analyze search results and provide a comprehensive, accurate summary that directly answers the user's query.

CRITICAL INSTRUCTIONS:
- 1. Analyze ALL provided content carefully
- 2.
- 3.
- 4.
- 5.
- 6.
- 7.
- 8.
+ 1. Analyze ALL provided content carefully and thoroughly
+ 2. Extract and synthesize any information relevant to answering the user's question
+ 3. Include specific facts, dates, numbers, and quotes when present
+ 4. If information is contradictory between sources, mention this
+ 5. Cite sources by mentioning the publication or website name
+ 6. Be thorough and detailed in your analysis
+ 7. If some content seems tangentially related, still include relevant portions
+ 8. Focus on directly answering the user's query with the most relevant information first

-
-
- Format your response as a comprehensive summary, not bullet points."""
-
- def validate_content_quality(self, search_results: List[SearchResult], query: str) -> Tuple[List[SearchResult], str]:
- """Validate and filter content quality before summarization"""
- valid_results = []
- validation_info = []
-
- # More intelligent keyword extraction
- query_lower = query.lower()
-
- # Extract key entities and terms
- important_keywords = []
-
- # Split query into words and extract meaningful terms
- words = query_lower.split()
- for word in words:
- if len(word) > 2 and word not in ['news', 'latest', 'recent', 'update', 'information', 'about']:
- important_keywords.append(word)
-
- # Also look for multi-word entities (like company names)
- # Extract potential company/entity names from query
- entity_patterns = [
- r'\b[A-Z][a-z]+ [A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b', # Proper names
- r'\b[A-Z]{2,}(?:\s+[A-Z][a-z]+)*\b', # Acronyms
- ]
-
- for pattern in entity_patterns:
- matches = re.findall(pattern, query)
- for match in matches:
- important_keywords.extend(match.lower().split())
-
- # Remove duplicates
- important_keywords = list(set(important_keywords))
-
- for result in search_results:
- if not result.content or len(result.content.strip()) < 50: # Lowered threshold
- validation_info.append(f"Skipped '{result.title}' - insufficient content")
- continue
-
- # Check if content contains query-relevant terms
- content_lower = result.content.lower()
- title_lower = result.title.lower()
- snippet_lower = result.snippet.lower()
- combined_text = f"{title_lower} {snippet_lower} {content_lower}"
-
- # More flexible relevance scoring
- relevant_score = 0
- matched_keywords = []
-
- for keyword in important_keywords:
- if keyword in combined_text:
- if keyword in content_lower:
- relevant_score += 2
- matched_keywords.append(keyword)
- elif keyword in title_lower:
- relevant_score += 3 # Title matches are very important
- matched_keywords.append(keyword)
- elif keyword in snippet_lower:
- relevant_score += 1
- matched_keywords.append(keyword)
-
- # Special handling for acronyms and company names
- # If query contains a company acronym (like KKR), be more lenient
- has_company_match = any(len(kw) <= 4 and kw.isupper() for kw in query.split())
- if has_company_match:
- relevant_score += 1 # Boost score for company-related queries
-
- # Lower the threshold and accept more results
- if relevant_score >= 1 or len(matched_keywords) >= 1:
- valid_results.append(result)
- validation_info.append(f"✅ '{result.title}' - score: {relevant_score}, matched: {matched_keywords}")
- else:
- validation_info.append(f"Skipped '{result.title}' - no relevant keywords found")
-
- # If we filtered out too many results, be more lenient
- if len(valid_results) < len(search_results) * 0.3: # If we filtered out more than 70%
- validation_info.append("⚠️ Too many results filtered, being more lenient...")
- # Add back results that have any content
- for result in search_results:
- if result not in valid_results and result.content.strip():
- valid_results.append(result)
- validation_info.append(f"✅ '{result.title}' - added back (lenient mode)")
-
- validation_summary = "\n".join(validation_info)
- return valid_results, validation_summary
+ Format your response as a comprehensive summary, not bullet points. Provide a thorough analysis of all the content provided."""

def prepare_content_for_llm(self, query: str, search_results: List[SearchResult]) -> str:
- """Prepare
-
-
+ """Prepare content for LLM without validation filtering - include ALL scraped content"""
+
+ # No content validation - include all results that have any content
+ valid_results = [result for result in search_results if result.content.strip()]

if not valid_results:
return f"""Query: "{query}"

-
- {validation_info}
-
- The search results did not pass the initial relevance filter, but this might be overly restrictive. Please analyze the raw content provided and extract any information that could be relevant to answering the user's query, even if the connection is not immediately obvious."""
+ No content was successfully scraped from the search results. This might be due to anti-bot protections or network issues."""

content_parts = [f'User Query: "{query}"\n']
- content_parts.append(f"Number of
+ content_parts.append(f"Number of sources with content: {len(valid_results)}\n")

for i, result in enumerate(valid_results, 1):
content_parts.append(f"=== SOURCE {i} ===")

@@ -722,108 +632,39 @@ The search results did not pass the initial relevance filter, but this might be
if result.snippet and not result.content.startswith(result.snippet[:50]):
content_parts.append(f"Snippet: {result.snippet}")

- #
+ # Include FULL content without truncation - let the LLM handle the large context
content = result.content.strip()
- if len(content) > 3000:
- # Try to find a good breaking point
- truncate_at = 3000
- # Look for sentence endings near the truncation point
- for i in range(2800, 3200):
- if i < len(content) and content[i] in '.!?':
- truncate_at = i + 1
- break
- content = content[:truncate_at] + "... [content truncated]"
-
content_parts.append(f"Content: {content}")
content_parts.append("") # Empty line between sources

return "\n".join(content_parts)

async def summarize_with_groq(self, query: str, search_results: List[SearchResult],
- temperature: float = 0.3, max_tokens: int =
- """
+ temperature: float = 0.3, max_tokens: int = 8000) -> str:
+ """Enhanced Groq summarization with increased token limits and no content filtering"""
if not self.groq_api_key:
return "Groq API key not provided"

try:
- # Prepare
+ # Prepare content without validation filtering
prepared_content = self.prepare_content_for_llm(query, search_results)

# Debug output
print(f"DEBUG - Sending {len(prepared_content)} characters to Groq AI")
print(f"DEBUG - Results with content: {len([r for r in search_results if r.content])}")
- print(f"DEBUG -
-
- user_prompt = f"""Please analyze the following search results and provide a comprehensive summary that directly answers the user's query.
-
- {prepared_content}
-
- Instructions:
- - Focus ONLY on information relevant to the query: "{query}"
- - If the results don't contain relevant information, explicitly state this
- - Be specific and factual, include dates/numbers when available
- - Mention source publications when referencing information
- - Don't provide generic advice if specific information isn't found"""
-
- headers = {
- "Authorization": f"Bearer {self.groq_api_key}",
- "Content-Type": "application/json"
- }
-
- payload = {
- "model": self.groq_model,
- "messages": [
- {"role": "system", "content": self.create_system_prompt()},
- {"role": "user", "content": user_prompt}
- ],
- "temperature": temperature,
- "max_tokens": max_tokens,
- "stream": False
- }
-
- async with aiohttp.ClientSession() as session:
- async with session.post("https://api.groq.com/openai/v1/chat/completions",
- headers=headers, json=payload) as response:
- if response.status == 200:
- result = await response.json()
- summary = result["choices"][0]["message"]["content"]
-
- # Add debug info in development
- debug_info = f"\n\n[DEBUG - Content Sources: {len([r for r in search_results if r.content])} with content, {len(search_results)} total]"
- return summary + debug_info
-
- else:
- error_text = await response.text()
- return f"Groq API error: {response.status} - {error_text}"
-
- except Exception as e:
- return f"Error with Groq summarization: {str(e)}"
-
- async def summarize_with_openrouter(self, query: str, search_results: List[SearchResult],
- temperature: float = 0.3, max_tokens: int = 2000) -> str:
- """Improved OpenRouter summarization with better content preparation"""
- if not self.openrouter_api_key:
- return "OpenRouter API key not provided"
-
- try:
- # Prepare well-structured content
- prepared_content = self.prepare_content_for_llm(query, search_results)
-
- # Debug output
- print(f"DEBUG - Sending {len(prepared_content)} characters to OpenRouter AI")
- print(f"DEBUG - Results with content: {len([r for r in search_results if r.content])}")
- print(f"DEBUG - First 300 chars: {prepared_content[:300]}...")
+ print(f"DEBUG - Max completion tokens: {max_tokens}")

user_prompt = f"""Please analyze the following search results and provide a comprehensive summary that directly answers the user's query.

{prepared_content}

Instructions:
- - Focus
- -
+ - Focus on information relevant to the query: "{query}"
+ - Analyze ALL provided content thoroughly
- Be specific and factual, include dates/numbers when available
- Mention source publications when referencing information
- -
+ - If results contain limited relevant information, state this clearly but still extract what you can
+ - Provide a comprehensive analysis of all available content"""

headers = {
"Authorization": f"Bearer {self.openrouter_api_key}",

@@ -849,8 +690,8 @@ Instructions:
result = await response.json()
summary = result["choices"][0]["message"]["content"]

- # Add debug info
- debug_info = f"\n\n[
+ # Add debug info
+ debug_info = f"\n\n[Content Sources: {len([r for r in search_results if r.content])} with content, {len(search_results)} total]"
return summary + debug_info

else:

@@ -931,9 +772,9 @@ class AISearchEngine:
max_successful=target_successful
)

- #
- results_with_content = [r for r in scraped_results if r.content.strip()
- status_updates.append(f"Successfully scraped {len(results_with_content)} articles with
+ # Include ALL results with any content (no filtering)
+ results_with_content = [r for r in scraped_results if r.content.strip()]
+ status_updates.append(f"Successfully scraped {len(results_with_content)} articles with content")

# Debug: Show what content we actually got
for i, result in enumerate(results_with_content[:3]):

@@ -971,8 +812,8 @@ class AISearchEngine:
if not results_with_content:
return "No relevant results found after filtering", "\n".join(status_updates)

- # Step 5: LLM Summarization
- status_updates.append(f"🤖 Generating summary using {model}...")
+ # Step 5: LLM Summarization - now sends ALL content without validation filtering
+ status_updates.append(f"🤖 Generating summary using {model} (processing all scraped content)...")

try:
if model.startswith("Groq"):

@@ -1006,6 +847,7 @@ class AISearchEngine:
metadata += f"- Search engines: {', '.join(search_engines)}\n"
metadata += f"- Model: {model}\n"
metadata += f"- Embeddings used: {use_embeddings}\n"
+ metadata += f"- Content filtering: DISABLED (all content sent to LLM)\n"

final_summary = summary + metadata
status_updates.append(f"✅ Summary generated in {processing_time:.2f}s")

@@ -1111,7 +953,7 @@ async def chat_inference(message, history, groq_key, openrouter_key, model_choic
yield "🧠 Filtering results using embeddings..."
await asyncio.sleep(0.1)

- yield "🤖 Generating AI-powered summary..."
+ yield "🤖 Generating AI-powered summary (processing all scraped content)..."
await asyncio.sleep(0.1)

# Perform the actual search and summarization

@@ -1182,12 +1024,12 @@ def create_gradio_interface():
info="Number of search results to fetch from each engine"
),
gr.Slider(
- minimum=
- maximum=
- value=
- step=
- label="📝 Max Tokens",
- info="Maximum length of the AI-generated summary"
+ minimum=1000,
+ maximum=8000,
+ value=8000,
+ step=500,
+ label="📝 Max Completion Tokens",
+ info="Maximum length of the AI-generated summary (Groq: up to 8000, OpenRouter: up to 4000)"
)
]

@@ -1196,26 +1038,23 @@ def create_gradio_interface():
fn=chat_inference,
additional_inputs=additional_inputs,
additional_inputs_accordion=gr.Accordion("⚙️ Configuration & Advanced Parameters", open=True),
- title="🔍 AI-Powered Search Engine",
+ title="🔍 AI-Powered Search Engine - No Content Filtering",
description="""
**Search across Google, Bing, and Yahoo, then get AI-powered summaries!**

✨ **Features:** Multi-engine search • Query enhancement • Parallel scraping • AI summarization • Embedding filtering
+ 🆕 **Updated:** All scraped content is now sent to the LLM without filtering • Increased Groq token limits (up to 8K)

🚀 **Quick Start:** 1) Add your API key below 2) Select search engines 3) Ask any question!
""",
cache_examples=False,
- #retry_btn="🔄 Retry",
- #undo_btn="↩️ Undo",
- #clear_btn="🗑️ Clear",
submit_btn="🔍 Search & Summarize",
stop_btn="⏹️ Stop",
chatbot=gr.Chatbot(
show_copy_button=True,
- #likeable=True,
layout="bubble",
height=600,
- placeholder="🔍 Ready to search!
+ placeholder="🔍 Ready to search! All scraped content will be sent to the LLM for comprehensive analysis.",
show_share_button=True
),
theme=gr.themes.Soft(),

@@ -1227,4 +1066,4 @@ def create_gradio_interface():

if __name__ == "__main__":
demo = create_gradio_interface()
- demo.launch(share=True)
+ demo.launch(share=True)
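
For context on what this change does at the request level, below is a minimal, self-contained sketch of an OpenAI-compatible chat-completions call to Groq with aiohttp, mirroring the endpoint and payload fields visible in the diff and the new 8000-token ceiling. The helper name, example model id, and prompt strings are illustrative placeholders, not code from app.py.

# Hypothetical standalone sketch; assumes aiohttp is installed and GROQ_API_KEY is set.
import asyncio
import os

import aiohttp

GROQ_URL = "https://api.groq.com/openai/v1/chat/completions"  # endpoint shown in the diff


async def groq_chat_summary(query: str, prepared_content: str,
                            model: str = "llama-3.1-8b-instant",  # illustrative model id
                            temperature: float = 0.3,
                            max_tokens: int = 8000) -> str:
    """Send query plus scraped content to Groq and return the summary text."""
    headers = {
        "Authorization": f"Bearer {os.environ['GROQ_API_KEY']}",
        "Content-Type": "application/json",
    }
    payload = {
        "model": model,
        "messages": [
            {"role": "system", "content": "You are an expert research assistant."},
            {"role": "user", "content": f'Query: "{query}"\n\n{prepared_content}'},
        ],
        "temperature": temperature,
        "max_tokens": max_tokens,  # raised to 8000 in this commit
        "stream": False,
    }
    async with aiohttp.ClientSession() as session:
        async with session.post(GROQ_URL, headers=headers, json=payload) as response:
            if response.status != 200:
                return f"Groq API error: {response.status} - {await response.text()}"
            result = await response.json()
            return result["choices"][0]["message"]["content"]


if __name__ == "__main__":
    print(asyncio.run(groq_chat_summary("test query", "=== SOURCE 1 ===\nContent: ...")))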