Shreyas94 committed on
Commit
9d35d68
·
verified ·
1 Parent(s): 8675311

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +46 -207
app.py CHANGED
@@ -399,12 +399,10 @@ class ContentScraper:
399
  except:
400
  continue
401
 
402
- # Clean and limit content
403
  if content:
404
  # Remove excessive whitespace
405
  content = ' '.join(content.split())
406
- # Limit length
407
- content = content[:3000]
408
 
409
  return content, pub_date
410
 
@@ -424,7 +422,7 @@ class ContentScraper:
424
  article.parse()
425
 
426
  if article.text and len(article.text.strip()) > 100:
427
- content = article.text.strip()[:3000]
428
  pub_date = article.publish_date.isoformat() if article.publish_date else None
429
  return content, pub_date
430
 
@@ -581,7 +579,7 @@ class EmbeddingFilter:
581
  return search_results
582
 
583
  class LLMSummarizer:
584
- """Improved summarizer with better content preparation and validation"""
585
 
586
  def __init__(self, groq_api_key: str = "", openrouter_api_key: str = ""):
587
  self.groq_api_key = groq_api_key
@@ -594,118 +592,30 @@ class LLMSummarizer:
594
  return """You are an expert research assistant. Your task is to analyze search results and provide a comprehensive, accurate summary that directly answers the user's query.
595
 
596
  CRITICAL INSTRUCTIONS:
597
- 1. Analyze ALL provided content carefully - even if it seems only tangentially related
598
- 2. Look for connections between the query and the content, even if not immediately obvious
599
- 3. If content is about a parent company/organization mentioned in the query, include relevant information
600
- 4. Extract and synthesize any information that could be relevant to answering the user's question
601
- 5. Include specific facts, dates, numbers, and quotes when present
602
- 6. If information is contradictory between sources, mention this
603
- 7. Cite sources by mentioning the publication or website name
604
- 8. Be thorough and detailed rather than dismissive
605
 
606
- ONLY state that results are not relevant if they are completely unrelated to any aspect of the query. If there is ANY connection (like parent company info, related business segments, etc.), include that information.
607
-
608
- Format your response as a comprehensive summary, not bullet points."""
609
-
610
- def validate_content_quality(self, search_results: List[SearchResult], query: str) -> Tuple[List[SearchResult], str]:
611
- """Validate and filter content quality before summarization"""
612
- valid_results = []
613
- validation_info = []
614
-
615
- # More intelligent keyword extraction
616
- query_lower = query.lower()
617
-
618
- # Extract key entities and terms
619
- important_keywords = []
620
-
621
- # Split query into words and extract meaningful terms
622
- words = query_lower.split()
623
- for word in words:
624
- if len(word) > 2 and word not in ['news', 'latest', 'recent', 'update', 'information', 'about']:
625
- important_keywords.append(word)
626
-
627
- # Also look for multi-word entities (like company names)
628
- # Extract potential company/entity names from query
629
- entity_patterns = [
630
- r'\b[A-Z][a-z]+ [A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b', # Proper names
631
- r'\b[A-Z]{2,}(?:\s+[A-Z][a-z]+)*\b', # Acronyms
632
- ]
633
-
634
- for pattern in entity_patterns:
635
- matches = re.findall(pattern, query)
636
- for match in matches:
637
- important_keywords.extend(match.lower().split())
638
-
639
- # Remove duplicates
640
- important_keywords = list(set(important_keywords))
641
-
642
- for result in search_results:
643
- if not result.content or len(result.content.strip()) < 50: # Lowered threshold
644
- validation_info.append(f"Skipped '{result.title}' - insufficient content")
645
- continue
646
-
647
- # Check if content contains query-relevant terms
648
- content_lower = result.content.lower()
649
- title_lower = result.title.lower()
650
- snippet_lower = result.snippet.lower()
651
- combined_text = f"{title_lower} {snippet_lower} {content_lower}"
652
-
653
- # More flexible relevance scoring
654
- relevant_score = 0
655
- matched_keywords = []
656
-
657
- for keyword in important_keywords:
658
- if keyword in combined_text:
659
- if keyword in content_lower:
660
- relevant_score += 2
661
- matched_keywords.append(keyword)
662
- elif keyword in title_lower:
663
- relevant_score += 3 # Title matches are very important
664
- matched_keywords.append(keyword)
665
- elif keyword in snippet_lower:
666
- relevant_score += 1
667
- matched_keywords.append(keyword)
668
-
669
- # Special handling for acronyms and company names
670
- # If query contains a company acronym (like KKR), be more lenient
671
- has_company_match = any(len(kw) <= 4 and kw.isupper() for kw in query.split())
672
- if has_company_match:
673
- relevant_score += 1 # Boost score for company-related queries
674
-
675
- # Lower the threshold and accept more results
676
- if relevant_score >= 1 or len(matched_keywords) >= 1:
677
- valid_results.append(result)
678
- validation_info.append(f"βœ“ '{result.title}' - score: {relevant_score}, matched: {matched_keywords}")
679
- else:
680
- validation_info.append(f"Skipped '{result.title}' - no relevant keywords found")
681
-
682
- # If we filtered out too many results, be more lenient
683
- if len(valid_results) < len(search_results) * 0.3: # If we filtered out more than 70%
684
- validation_info.append("⚠️ Too many results filtered, being more lenient...")
685
- # Add back results that have any content
686
- for result in search_results:
687
- if result not in valid_results and result.content.strip():
688
- valid_results.append(result)
689
- validation_info.append(f"βœ“ '{result.title}' - added back (lenient mode)")
690
-
691
- validation_summary = "\n".join(validation_info)
692
- return valid_results, validation_summary
693
 
694
  def prepare_content_for_llm(self, query: str, search_results: List[SearchResult]) -> str:
695
- """Prepare well-structured content for LLM"""
696
- # Validate content first
697
- valid_results, validation_info = self.validate_content_quality(search_results, query)
 
698
 
699
  if not valid_results:
700
  return f"""Query: "{query}"
701
 
702
- VALIDATION RESULTS:
703
- {validation_info}
704
-
705
- The search results did not pass the initial relevance filter, but this might be overly restrictive. Please analyze the raw content provided and extract any information that could be relevant to answering the user's query, even if the connection is not immediately obvious."""
706
 
707
  content_parts = [f'User Query: "{query}"\n']
708
- content_parts.append(f"Number of relevant sources found: {len(valid_results)}\n")
709
 
710
  for i, result in enumerate(valid_results, 1):
711
  content_parts.append(f"=== SOURCE {i} ===")
@@ -722,108 +632,39 @@ The search results did not pass the initial relevance filter, but this might be
722
  if result.snippet and not result.content.startswith(result.snippet[:50]):
723
  content_parts.append(f"Snippet: {result.snippet}")
724
 
725
- # Intelligently truncate content while preserving meaning
726
  content = result.content.strip()
727
- if len(content) > 3000:
728
- # Try to find a good breaking point
729
- truncate_at = 3000
730
- # Look for sentence endings near the truncation point
731
- for i in range(2800, 3200):
732
- if i < len(content) and content[i] in '.!?':
733
- truncate_at = i + 1
734
- break
735
- content = content[:truncate_at] + "... [content truncated]"
736
-
737
  content_parts.append(f"Content: {content}")
738
  content_parts.append("") # Empty line between sources
739
 
740
  return "\n".join(content_parts)
741
 
742
  async def summarize_with_groq(self, query: str, search_results: List[SearchResult],
743
- temperature: float = 0.3, max_tokens: int = 2000) -> str:
744
- """Improved Groq summarization with better content preparation"""
745
  if not self.groq_api_key:
746
  return "Groq API key not provided"
747
 
748
  try:
749
- # Prepare well-structured content
750
  prepared_content = self.prepare_content_for_llm(query, search_results)
751
 
752
  # Debug output
753
  print(f"DEBUG - Sending {len(prepared_content)} characters to Groq AI")
754
  print(f"DEBUG - Results with content: {len([r for r in search_results if r.content])}")
755
- print(f"DEBUG - First 300 chars: {prepared_content[:300]}...")
756
-
757
- user_prompt = f"""Please analyze the following search results and provide a comprehensive summary that directly answers the user's query.
758
-
759
- {prepared_content}
760
-
761
- Instructions:
762
- - Focus ONLY on information relevant to the query: "{query}"
763
- - If the results don't contain relevant information, explicitly state this
764
- - Be specific and factual, include dates/numbers when available
765
- - Mention source publications when referencing information
766
- - Don't provide generic advice if specific information isn't found"""
767
-
768
- headers = {
769
- "Authorization": f"Bearer {self.groq_api_key}",
770
- "Content-Type": "application/json"
771
- }
772
-
773
- payload = {
774
- "model": self.groq_model,
775
- "messages": [
776
- {"role": "system", "content": self.create_system_prompt()},
777
- {"role": "user", "content": user_prompt}
778
- ],
779
- "temperature": temperature,
780
- "max_tokens": max_tokens,
781
- "stream": False
782
- }
783
-
784
- async with aiohttp.ClientSession() as session:
785
- async with session.post("https://api.groq.com/openai/v1/chat/completions",
786
- headers=headers, json=payload) as response:
787
- if response.status == 200:
788
- result = await response.json()
789
- summary = result["choices"][0]["message"]["content"]
790
-
791
- # Add debug info in development
792
- debug_info = f"\n\n[DEBUG - Content Sources: {len([r for r in search_results if r.content])} with content, {len(search_results)} total]"
793
- return summary + debug_info
794
-
795
- else:
796
- error_text = await response.text()
797
- return f"Groq API error: {response.status} - {error_text}"
798
-
799
- except Exception as e:
800
- return f"Error with Groq summarization: {str(e)}"
801
-
802
- async def summarize_with_openrouter(self, query: str, search_results: List[SearchResult],
803
- temperature: float = 0.3, max_tokens: int = 2000) -> str:
804
- """Improved OpenRouter summarization with better content preparation"""
805
- if not self.openrouter_api_key:
806
- return "OpenRouter API key not provided"
807
-
808
- try:
809
- # Prepare well-structured content
810
- prepared_content = self.prepare_content_for_llm(query, search_results)
811
-
812
- # Debug output
813
- print(f"DEBUG - Sending {len(prepared_content)} characters to OpenRouter AI")
814
- print(f"DEBUG - Results with content: {len([r for r in search_results if r.content])}")
815
- print(f"DEBUG - First 300 chars: {prepared_content[:300]}...")
816
 
817
  user_prompt = f"""Please analyze the following search results and provide a comprehensive summary that directly answers the user's query.
818
 
819
  {prepared_content}
820
 
821
  Instructions:
822
- - Focus ONLY on information relevant to the query: "{query}"
823
- - If the results don't contain relevant information, explicitly state this
824
  - Be specific and factual, include dates/numbers when available
825
  - Mention source publications when referencing information
826
- - Don't provide generic advice if specific information isn't found"""
 
827
 
828
  headers = {
829
  "Authorization": f"Bearer {self.openrouter_api_key}",
@@ -849,8 +690,8 @@ Instructions:
849
  result = await response.json()
850
  summary = result["choices"][0]["message"]["content"]
851
 
852
- # Add debug info in development
853
- debug_info = f"\n\n[DEBUG - Content Sources: {len([r for r in search_results if r.content])} with content, {len(search_results)} total]"
854
  return summary + debug_info
855
 
856
  else:
@@ -931,9 +772,9 @@ class AISearchEngine:
931
  max_successful=target_successful
932
  )
933
 
934
- # Filter results with meaningful content
935
- results_with_content = [r for r in scraped_results if r.content.strip() and len(r.content.strip()) > 100]
936
- status_updates.append(f"Successfully scraped {len(results_with_content)} articles with meaningful content")
937
 
938
  # Debug: Show what content we actually got
939
  for i, result in enumerate(results_with_content[:3]):
@@ -971,8 +812,8 @@ class AISearchEngine:
971
  if not results_with_content:
972
  return "No relevant results found after filtering", "\n".join(status_updates)
973
 
974
- # Step 5: LLM Summarization
975
- status_updates.append(f"πŸ€– Generating summary using {model}...")
976
 
977
  try:
978
  if model.startswith("Groq"):
@@ -1006,6 +847,7 @@ class AISearchEngine:
1006
  metadata += f"- Search engines: {', '.join(search_engines)}\n"
1007
  metadata += f"- Model: {model}\n"
1008
  metadata += f"- Embeddings used: {use_embeddings}\n"
 
1009
 
1010
  final_summary = summary + metadata
1011
  status_updates.append(f"βœ… Summary generated in {processing_time:.2f}s")
@@ -1111,7 +953,7 @@ async def chat_inference(message, history, groq_key, openrouter_key, model_choic
1111
  yield "🧠 Filtering results using embeddings..."
1112
  await asyncio.sleep(0.1)
1113
 
1114
- yield "πŸ€– Generating AI-powered summary..."
1115
  await asyncio.sleep(0.1)
1116
 
1117
  # Perform the actual search and summarization
@@ -1182,12 +1024,12 @@ def create_gradio_interface():
1182
  info="Number of search results to fetch from each engine"
1183
  ),
1184
  gr.Slider(
1185
- minimum=500,
1186
- maximum=4000,
1187
- value=2000,
1188
- step=100,
1189
- label="πŸ“ Max Tokens",
1190
- info="Maximum length of the AI-generated summary"
1191
  )
1192
  ]
1193
 
@@ -1196,26 +1038,23 @@ def create_gradio_interface():
1196
  fn=chat_inference,
1197
  additional_inputs=additional_inputs,
1198
  additional_inputs_accordion=gr.Accordion("βš™οΈ Configuration & Advanced Parameters", open=True),
1199
- title="πŸ” AI-Powered Search Engine",
1200
  description="""
1201
  **Search across Google, Bing, and Yahoo, then get AI-powered summaries!**
1202
 
1203
  ✨ **Features:** Multi-engine search β€’ Query enhancement β€’ Parallel scraping β€’ AI summarization β€’ Embedding filtering
 
1204
 
1205
  πŸ“‹ **Quick Start:** 1) Add your API key below 2) Select search engines 3) Ask any question!
1206
  """,
1207
  cache_examples=False,
1208
- #retry_btn="πŸ”„ Retry",
1209
- #undo_btn="↩️ Undo",
1210
- #clear_btn="πŸ—‘οΈ Clear",
1211
  submit_btn="πŸ” Search & Summarize",
1212
  stop_btn="⏹️ Stop",
1213
  chatbot=gr.Chatbot(
1214
  show_copy_button=True,
1215
- #likeable=True,
1216
  layout="bubble",
1217
  height=600,
1218
- placeholder="πŸš€ Ready to search! Configure your settings below and ask me anything.",
1219
  show_share_button=True
1220
  ),
1221
  theme=gr.themes.Soft(),
@@ -1227,4 +1066,4 @@ def create_gradio_interface():
1227
 
1228
  if __name__ == "__main__":
1229
  demo = create_gradio_interface()
1230
- demo.launch(share=True)
 
399
  except:
400
  continue
401
 
402
+ # Don't limit content length here - let LLM handle full content
403
  if content:
404
  # Remove excessive whitespace
405
  content = ' '.join(content.split())
 
 
406
 
407
  return content, pub_date
408
 
 
422
  article.parse()
423
 
424
  if article.text and len(article.text.strip()) > 100:
425
+ content = article.text.strip() # Don't limit content length
426
  pub_date = article.publish_date.isoformat() if article.publish_date else None
427
  return content, pub_date
428
 
 
579
  return search_results
580
 
581
  class LLMSummarizer:
582
+ """Improved summarizer without content validation filtering - sends all scraped content to LLM"""
583
 
584
  def __init__(self, groq_api_key: str = "", openrouter_api_key: str = ""):
585
  self.groq_api_key = groq_api_key
 
592
  return """You are an expert research assistant. Your task is to analyze search results and provide a comprehensive, accurate summary that directly answers the user's query.
593
 
594
  CRITICAL INSTRUCTIONS:
595
+ 1. Analyze ALL provided content carefully and thoroughly
596
+ 2. Extract and synthesize any information relevant to answering the user's question
597
+ 3. Include specific facts, dates, numbers, and quotes when present
598
+ 4. If information is contradictory between sources, mention this
599
+ 5. Cite sources by mentioning the publication or website name
600
+ 6. Be thorough and detailed in your analysis
601
+ 7. If some content seems tangentially related, still include relevant portions
602
+ 8. Focus on directly answering the user's query with the most relevant information first
603
 
604
+ Format your response as a comprehensive summary, not bullet points. Provide a thorough analysis of all the content provided."""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
605
 
606
  def prepare_content_for_llm(self, query: str, search_results: List[SearchResult]) -> str:
607
+ """Prepare content for LLM without validation filtering - include ALL scraped content"""
608
+
609
+ # No content validation - include all results that have any content
610
+ valid_results = [result for result in search_results if result.content.strip()]
611
 
612
  if not valid_results:
613
  return f"""Query: "{query}"
614
 
615
+ No content was successfully scraped from the search results. This might be due to anti-bot protections or network issues."""
 
 
 
616
 
617
  content_parts = [f'User Query: "{query}"\n']
618
+ content_parts.append(f"Number of sources with content: {len(valid_results)}\n")
619
 
620
  for i, result in enumerate(valid_results, 1):
621
  content_parts.append(f"=== SOURCE {i} ===")
 
632
  if result.snippet and not result.content.startswith(result.snippet[:50]):
633
  content_parts.append(f"Snippet: {result.snippet}")
634
 
635
+ # Include FULL content without truncation - let the LLM handle the large context
636
  content = result.content.strip()
 
 
 
 
 
 
 
 
 
 
637
  content_parts.append(f"Content: {content}")
638
  content_parts.append("") # Empty line between sources
639
 
640
  return "\n".join(content_parts)
641
 
642
  async def summarize_with_groq(self, query: str, search_results: List[SearchResult],
643
+ temperature: float = 0.3, max_tokens: int = 8000) -> str:
644
+ """Enhanced Groq summarization with increased token limits and no content filtering"""
645
  if not self.groq_api_key:
646
  return "Groq API key not provided"
647
 
648
  try:
649
+ # Prepare content without validation filtering
650
  prepared_content = self.prepare_content_for_llm(query, search_results)
651
 
652
  # Debug output
653
  print(f"DEBUG - Sending {len(prepared_content)} characters to Groq AI")
654
  print(f"DEBUG - Results with content: {len([r for r in search_results if r.content])}")
655
+ print(f"DEBUG - Max completion tokens: {max_tokens}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
656
 
657
  user_prompt = f"""Please analyze the following search results and provide a comprehensive summary that directly answers the user's query.
658
 
659
  {prepared_content}
660
 
661
  Instructions:
662
+ - Focus on information relevant to the query: "{query}"
663
+ - Analyze ALL provided content thoroughly
664
  - Be specific and factual, include dates/numbers when available
665
  - Mention source publications when referencing information
666
+ - If results contain limited relevant information, state this clearly but still extract what you can
667
+ - Provide a comprehensive analysis of all available content"""
668
 
669
  headers = {
670
  "Authorization": f"Bearer {self.openrouter_api_key}",
 
690
  result = await response.json()
691
  summary = result["choices"][0]["message"]["content"]
692
 
693
+ # Add debug info
694
+ debug_info = f"\n\n[Content Sources: {len([r for r in search_results if r.content])} with content, {len(search_results)} total]"
695
  return summary + debug_info
696
 
697
  else:
 
772
  max_successful=target_successful
773
  )
774
 
775
+ # Include ALL results with any content (no filtering)
776
+ results_with_content = [r for r in scraped_results if r.content.strip()]
777
+ status_updates.append(f"Successfully scraped {len(results_with_content)} articles with content")
778
 
779
  # Debug: Show what content we actually got
780
  for i, result in enumerate(results_with_content[:3]):
 
812
  if not results_with_content:
813
  return "No relevant results found after filtering", "\n".join(status_updates)
814
 
815
+ # Step 5: LLM Summarization - now sends ALL content without validation filtering
816
+ status_updates.append(f"πŸ€– Generating summary using {model} (processing all scraped content)...")
817
 
818
  try:
819
  if model.startswith("Groq"):
 
847
  metadata += f"- Search engines: {', '.join(search_engines)}\n"
848
  metadata += f"- Model: {model}\n"
849
  metadata += f"- Embeddings used: {use_embeddings}\n"
850
+ metadata += f"- Content filtering: DISABLED (all content sent to LLM)\n"
851
 
852
  final_summary = summary + metadata
853
  status_updates.append(f"βœ… Summary generated in {processing_time:.2f}s")
 
953
  yield "🧠 Filtering results using embeddings..."
954
  await asyncio.sleep(0.1)
955
 
956
+ yield "πŸ€– Generating AI-powered summary (processing all scraped content)..."
957
  await asyncio.sleep(0.1)
958
 
959
  # Perform the actual search and summarization
 
1024
  info="Number of search results to fetch from each engine"
1025
  ),
1026
  gr.Slider(
1027
+ minimum=1000,
1028
+ maximum=8000,
1029
+ value=8000,
1030
+ step=500,
1031
+ label="πŸ“ Max Completion Tokens",
1032
+ info="Maximum length of the AI-generated summary (Groq: up to 8000, OpenRouter: up to 4000)"
1033
  )
1034
  ]
1035
 
 
1038
  fn=chat_inference,
1039
  additional_inputs=additional_inputs,
1040
  additional_inputs_accordion=gr.Accordion("βš™οΈ Configuration & Advanced Parameters", open=True),
1041
+ title="πŸ” AI-Powered Search Engine - No Content Filtering",
1042
  description="""
1043
  **Search across Google, Bing, and Yahoo, then get AI-powered summaries!**
1044
 
1045
  ✨ **Features:** Multi-engine search β€’ Query enhancement β€’ Parallel scraping β€’ AI summarization β€’ Embedding filtering
1046
+ πŸš€ **Updated:** All scraped content is now sent to the LLM without filtering β€’ Increased Groq token limits (up to 8K)
1047
 
1048
  πŸ“‹ **Quick Start:** 1) Add your API key below 2) Select search engines 3) Ask any question!
1049
  """,
1050
  cache_examples=False,
 
 
 
1051
  submit_btn="πŸ” Search & Summarize",
1052
  stop_btn="⏹️ Stop",
1053
  chatbot=gr.Chatbot(
1054
  show_copy_button=True,
 
1055
  layout="bubble",
1056
  height=600,
1057
+ placeholder="πŸš€ Ready to search! All scraped content will be sent to the LLM for comprehensive analysis.",
1058
  show_share_button=True
1059
  ),
1060
  theme=gr.themes.Soft(),
 
1066
 
1067
  if __name__ == "__main__":
1068
  demo = create_gradio_interface()
1069
+ demo.launch(share=True)