Shreyas94 committed on
Commit
cf5225f
·
verified ·
1 Parent(s): 75acc4f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +151 -53
app.py CHANGED
@@ -581,7 +581,7 @@ class EmbeddingFilter:
581
  return search_results
582
 
583
  class LLMSummarizer:
584
- """Summarize search results using Groq or OpenRouter APIs"""
585
 
586
  def __init__(self, groq_api_key: str = "", openrouter_api_key: str = ""):
587
  self.groq_api_key = groq_api_key
@@ -591,48 +591,132 @@ class LLMSummarizer:
591
 
592
  def create_system_prompt(self) -> str:
593
  """Create system prompt for summarization"""
594
- return """You are an expert summarizer. Your task is to analyze search results and provide a comprehensive, accurate summary that directly answers the user's query.
595
 
596
- Instructions:
597
- 1. Focus only on information relevant to the user's query
598
- 2. Filter out noise, advertisements, and unrelated content
599
- 3. Synthesize information from multiple sources when possible
600
- 4. Maintain factual accuracy and cite sources when appropriate
601
- 5. If information is contradictory, note the discrepancies
602
- 6. Provide a clear, concise summary that directly addresses the query
603
- 7. Include relevant dates, numbers, and specific details when available
 
 
 
604
 
605
  Format your response as a comprehensive summary, not bullet points."""
606
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
607
  async def summarize_with_groq(self, query: str, search_results: List[SearchResult],
608
  temperature: float = 0.3, max_tokens: int = 2000) -> str:
609
- """Summarize using Groq API"""
610
  if not self.groq_api_key:
611
  return "Groq API key not provided"
612
 
613
  try:
614
- # Prepare the content for summarization
615
- content_json = {
616
- "user_query": query,
617
- "search_results": []
618
- }
619
 
620
- for result in search_results:
621
- content_json["search_results"].append({
622
- "title": result.title,
623
- "url": result.url,
624
- "snippet": result.snippet,
625
- "content": result.content[:2000], # Limit content length
626
- "publication_date": result.publication_date,
627
- "relevance_score": result.relevance_score
628
- })
629
 
630
- user_prompt = f"""Please summarize the following search results for the query: "{query}"
631
 
632
- Search Results Data:
633
- {json.dumps(content_json, indent=2)}
634
 
635
- Provide a comprehensive summary that directly answers the user's query based on the most relevant and recent information available."""
 
 
 
 
 
636
 
637
  headers = {
638
  "Authorization": f"Bearer {self.groq_api_key}",
@@ -646,7 +730,8 @@ Provide a comprehensive summary that directly answers the user's query based on
646
  {"role": "user", "content": user_prompt}
647
  ],
648
  "temperature": temperature,
649
- "max_tokens": max_tokens
 
650
  }
651
 
652
  async with aiohttp.ClientSession() as session:
@@ -654,43 +739,44 @@ Provide a comprehensive summary that directly answers the user's query based on
654
  headers=headers, json=payload) as response:
655
  if response.status == 200:
656
  result = await response.json()
657
- return result["choices"][0]["message"]["content"]
 
 
 
 
 
658
  else:
659
  error_text = await response.text()
660
  return f"Groq API error: {response.status} - {error_text}"
661
 
662
  except Exception as e:
663
  return f"Error with Groq summarization: {str(e)}"
664
-
665
  async def summarize_with_openrouter(self, query: str, search_results: List[SearchResult],
666
  temperature: float = 0.3, max_tokens: int = 2000) -> str:
667
- """Summarize using OpenRouter API"""
668
  if not self.openrouter_api_key:
669
  return "OpenRouter API key not provided"
670
 
671
  try:
672
- # Prepare the content for summarization
673
- content_json = {
674
- "user_query": query,
675
- "search_results": []
676
- }
677
 
678
- for result in search_results:
679
- content_json["search_results"].append({
680
- "title": result.title,
681
- "url": result.url,
682
- "snippet": result.snippet,
683
- "content": result.content[:2000], # Limit content length
684
- "publication_date": result.publication_date,
685
- "relevance_score": result.relevance_score
686
- })
687
 
688
- user_prompt = f"""Please summarize the following search results for the query: "{query}"
689
 
690
- Search Results Data:
691
- {json.dumps(content_json, indent=2)}
692
 
693
- Provide a comprehensive summary that directly answers the user's query based on the most relevant and recent information available."""
 
 
 
 
 
694
 
695
  headers = {
696
  "Authorization": f"Bearer {self.openrouter_api_key}",
@@ -714,7 +800,12 @@ Provide a comprehensive summary that directly answers the user's query based on
714
  headers=headers, json=payload) as response:
715
  if response.status == 200:
716
  result = await response.json()
717
- return result["choices"][0]["message"]["content"]
 
 
 
 
 
718
  else:
719
  error_text = await response.text()
720
  return f"OpenRouter API error: {response.status} - {error_text}"
@@ -797,6 +888,13 @@ class AISearchEngine:
797
  results_with_content = [r for r in scraped_results if r.content.strip() and len(r.content.strip()) > 100]
798
  status_updates.append(f"Successfully scraped {len(results_with_content)} articles with meaningful content")
799
 
 
 
 
 
 
 
 
800
  # If we don't have enough content, try to get some from snippets
801
  if len(results_with_content) < 3:
802
  status_updates.append("Using search snippets as fallback content...")
 
581
  return search_results
582
 
583
  class LLMSummarizer:
584
+ """Improved summarizer with better content preparation and validation"""
585
 
586
  def __init__(self, groq_api_key: str = "", openrouter_api_key: str = ""):
587
  self.groq_api_key = groq_api_key
 
591
 
592
  def create_system_prompt(self) -> str:
593
  """Create system prompt for summarization"""
594
+ return """You are an expert research assistant. Your task is to analyze search results and provide a comprehensive, accurate summary that directly answers the user's query.
595
 
596
+ CRITICAL INSTRUCTIONS:
597
+ 1. ONLY use information that is directly relevant to the user's query
598
+ 2. If the search results don't contain relevant information, explicitly state this
599
+ 3. Don't make up information or provide generic advice
600
+ 4. Synthesize information from multiple sources when available
601
+ 5. Include specific facts, dates, numbers, and quotes when present
602
+ 6. If information is contradictory between sources, mention this
603
+ 7. Cite sources by mentioning the publication or website name
604
+ 8. Be specific and detailed rather than vague
605
+
606
+ If the search results are not relevant to the query, respond with: "The search results do not contain sufficient relevant information to answer your query about [topic]. The results primarily contained [brief description of what was actually found]."
607
 
608
  Format your response as a comprehensive summary, not bullet points."""
609
+
610
+ def validate_content_quality(self, search_results: List[SearchResult], query: str) -> Tuple[List[SearchResult], str]:
611
+ """Validate and filter content quality before summarization"""
612
+ valid_results = []
613
+ validation_info = []
614
+
615
+ query_keywords = set(query.lower().split())
616
+
617
+ for result in search_results:
618
+ if not result.content or len(result.content.strip()) < 100:
619
+ validation_info.append(f"Skipped '{result.title}' - insufficient content")
620
+ continue
621
+
622
+ # Check if content contains query-relevant terms
623
+ content_lower = result.content.lower()
624
+ title_lower = result.title.lower()
625
+ snippet_lower = result.snippet.lower()
626
+
627
+ # Count relevant keywords
628
+ relevant_score = 0
629
+ for keyword in query_keywords:
630
+ if len(keyword) > 2: # Skip very short words
631
+ if keyword in content_lower:
632
+ relevant_score += 2
633
+ elif keyword in title_lower:
634
+ relevant_score += 1
635
+ elif keyword in snippet_lower:
636
+ relevant_score += 0.5
637
+
638
+ if relevant_score > 0:
639
+ valid_results.append(result)
640
+ validation_info.append(f"✓ '{result.title}' - relevance score: {relevant_score}")
641
+ else:
642
+ validation_info.append(f"Skipped '{result.title}' - not relevant to query")
643
+
644
+ validation_summary = "\n".join(validation_info)
645
+ return valid_results, validation_summary
646
+
647
+ def prepare_content_for_llm(self, query: str, search_results: List[SearchResult]) -> str:
648
+ """Prepare well-structured content for LLM"""
649
+ # Validate content first
650
+ valid_results, validation_info = self.validate_content_quality(search_results, query)
651
+
652
+ if not valid_results:
653
+ return f"""Query: "{query}"
654
+
655
+ VALIDATION RESULTS:
656
+ {validation_info}
657
+
658
+ No search results contained relevant content for this query. Please provide a response indicating that insufficient relevant information was found."""
659
+
660
+ content_parts = [f'User Query: "{query}"\n']
661
+ content_parts.append(f"Number of relevant sources found: {len(valid_results)}\n")
662
+
663
+ for i, result in enumerate(valid_results, 1):
664
+ content_parts.append(f"=== SOURCE {i} ===")
665
+ content_parts.append(f"Title: {result.title}")
666
+ content_parts.append(f"URL: {result.url}")
667
+
668
+ if result.publication_date:
669
+ content_parts.append(f"Date: {result.publication_date}")
670
+
671
+ if result.relevance_score > 0:
672
+ content_parts.append(f"Relevance Score: {result.relevance_score:.3f}")
673
+
674
+ # Include snippet if it's different from content start
675
+ if result.snippet and not result.content.startswith(result.snippet[:50]):
676
+ content_parts.append(f"Snippet: {result.snippet}")
677
+
678
+ # Intelligently truncate content while preserving meaning
679
+ content = result.content.strip()
680
+ if len(content) > 3000:
681
+ # Try to find a good breaking point
682
+ truncate_at = 3000
683
+ # Look for sentence endings near the truncation point
684
+ for i in range(2800, 3200):
685
+ if i < len(content) and content[i] in '.!?':
686
+ truncate_at = i + 1
687
+ break
688
+ content = content[:truncate_at] + "... [content truncated]"
689
+
690
+ content_parts.append(f"Content: {content}")
691
+ content_parts.append("") # Empty line between sources
692
+
693
+ return "\n".join(content_parts)
694
+
695
  async def summarize_with_groq(self, query: str, search_results: List[SearchResult],
696
  temperature: float = 0.3, max_tokens: int = 2000) -> str:
697
+ """Improved Groq summarization with better content preparation"""
698
  if not self.groq_api_key:
699
  return "Groq API key not provided"
700
 
701
  try:
702
+ # Prepare well-structured content
703
+ prepared_content = self.prepare_content_for_llm(query, search_results)
 
 
 
704
 
705
+ # Debug output
706
+ print(f"DEBUG - Sending {len(prepared_content)} characters to Groq AI")
707
+ print(f"DEBUG - Results with content: {len([r for r in search_results if r.content])}")
708
+ print(f"DEBUG - First 300 chars: {prepared_content[:300]}...")
 
 
 
 
 
709
 
710
+ user_prompt = f"""Please analyze the following search results and provide a comprehensive summary that directly answers the user's query.
711
 
712
+ {prepared_content}
 
713
 
714
+ Instructions:
715
+ - Focus ONLY on information relevant to the query: "{query}"
716
+ - If the results don't contain relevant information, explicitly state this
717
+ - Be specific and factual, include dates/numbers when available
718
+ - Mention source publications when referencing information
719
+ - Don't provide generic advice if specific information isn't found"""
720
 
721
  headers = {
722
  "Authorization": f"Bearer {self.groq_api_key}",
 
730
  {"role": "user", "content": user_prompt}
731
  ],
732
  "temperature": temperature,
733
+ "max_tokens": max_tokens,
734
+ "stream": False
735
  }
736
 
737
  async with aiohttp.ClientSession() as session:
 
739
  headers=headers, json=payload) as response:
740
  if response.status == 200:
741
  result = await response.json()
742
+ summary = result["choices"][0]["message"]["content"]
743
+
744
+ # Add debug info in development
745
+ debug_info = f"\n\n[DEBUG - Content Sources: {len([r for r in search_results if r.content])} with content, {len(search_results)} total]"
746
+ return summary + debug_info
747
+
748
  else:
749
  error_text = await response.text()
750
  return f"Groq API error: {response.status} - {error_text}"
751
 
752
  except Exception as e:
753
  return f"Error with Groq summarization: {str(e)}"
754
+
755
  async def summarize_with_openrouter(self, query: str, search_results: List[SearchResult],
756
  temperature: float = 0.3, max_tokens: int = 2000) -> str:
757
+ """Improved OpenRouter summarization with better content preparation"""
758
  if not self.openrouter_api_key:
759
  return "OpenRouter API key not provided"
760
 
761
  try:
762
+ # Prepare well-structured content
763
+ prepared_content = self.prepare_content_for_llm(query, search_results)
 
 
 
764
 
765
+ # Debug output
766
+ print(f"DEBUG - Sending {len(prepared_content)} characters to OpenRouter AI")
767
+ print(f"DEBUG - Results with content: {len([r for r in search_results if r.content])}")
768
+ print(f"DEBUG - First 300 chars: {prepared_content[:300]}...")
 
 
 
 
 
769
 
770
+ user_prompt = f"""Please analyze the following search results and provide a comprehensive summary that directly answers the user's query.
771
 
772
+ {prepared_content}
 
773
 
774
+ Instructions:
775
+ - Focus ONLY on information relevant to the query: "{query}"
776
+ - If the results don't contain relevant information, explicitly state this
777
+ - Be specific and factual, include dates/numbers when available
778
+ - Mention source publications when referencing information
779
+ - Don't provide generic advice if specific information isn't found"""
780
 
781
  headers = {
782
  "Authorization": f"Bearer {self.openrouter_api_key}",
 
800
  headers=headers, json=payload) as response:
801
  if response.status == 200:
802
  result = await response.json()
803
+ summary = result["choices"][0]["message"]["content"]
804
+
805
+ # Add debug info in development
806
+ debug_info = f"\n\n[DEBUG - Content Sources: {len([r for r in search_results if r.content])} with content, {len(search_results)} total]"
807
+ return summary + debug_info
808
+
809
  else:
810
  error_text = await response.text()
811
  return f"OpenRouter API error: {response.status} - {error_text}"
 
888
  results_with_content = [r for r in scraped_results if r.content.strip() and len(r.content.strip()) > 100]
889
  status_updates.append(f"Successfully scraped {len(results_with_content)} articles with meaningful content")
890
 
891
+ # Debug: Show what content we actually got
892
+ for i, result in enumerate(results_with_content[:3]):
893
+ print(f"Result {i+1}: {result.title}")
894
+ print(f"Content length: {len(result.content)}")
895
+ print(f"Content preview: {result.content[:200]}...")
896
+ print("---")
897
+
898
  # If we don't have enough content, try to get some from snippets
899
  if len(results_with_content) < 3:
900
  status_updates.append("Using search snippets as fallback content...")