Spaces:

Shreyas94
/

Sentinel02

Sleeping

App Files Files Community

Shreyas94 committed on Aug 4, 2025

Commit

cf5225f

verified ·

1 Parent(s): 75acc4f

Update app.py

Browse files

Files changed (1) hide show

app.py +151 -53

app.py CHANGED Viewed

@@ -581,7 +581,7 @@ class EmbeddingFilter:
             return search_results
 class LLMSummarizer:
-    """Summarize search results using Groq or OpenRouter APIs"""
     def __init__(self, groq_api_key: str = "", openrouter_api_key: str = ""):
         self.groq_api_key = groq_api_key
@@ -591,48 +591,132 @@ class LLMSummarizer:
     def create_system_prompt(self) -> str:
         """Create system prompt for summarization"""
-        return """You are an expert summarizer. Your task is to analyze search results and provide a comprehensive, accurate summary that directly answers the user's query.
-Instructions:
-1. Focus only on information relevant to the user's query
-2. Filter out noise, advertisements, and unrelated content
-3. Synthesize information from multiple sources when possible
-4. Maintain factual accuracy and cite sources when appropriate
-5. If information is contradictory, note the discrepancies
-6. Provide a clear, concise summary that directly addresses the query
-7. Include relevant dates, numbers, and specific details when available
 Format your response as a comprehensive summary, not bullet points."""
     async def summarize_with_groq(self, query: str, search_results: List[SearchResult],
                                 temperature: float = 0.3, max_tokens: int = 2000) -> str:
-        """Summarize using Groq API"""
         if not self.groq_api_key:
             return "Groq API key not provided"
         try:
-            # Prepare the content for summarization
-            content_json = {
-                "user_query": query,
-                "search_results": []
-            }
-            for result in search_results:
-                content_json["search_results"].append({
-                    "title": result.title,
-                    "url": result.url,
-                    "snippet": result.snippet,
-                    "content": result.content[:2000],  # Limit content length
-                    "publication_date": result.publication_date,
-                    "relevance_score": result.relevance_score
-                })
-            user_prompt = f"""Please summarize the following search results for the query: "{query}"
-Search Results Data:
-{json.dumps(content_json, indent=2)}
-Provide a comprehensive summary that directly answers the user's query based on the most relevant and recent information available."""
             headers = {
                 "Authorization": f"Bearer {self.groq_api_key}",
@@ -646,7 +730,8 @@ Provide a comprehensive summary that directly answers the user's query based on
                     {"role": "user", "content": user_prompt}
                 ],
                 "temperature": temperature,
-                "max_tokens": max_tokens
             }
             async with aiohttp.ClientSession() as session:
@@ -654,43 +739,44 @@ Provide a comprehensive summary that directly answers the user's query based on
                                       headers=headers, json=payload) as response:
                     if response.status == 200:
                         result = await response.json()
-                        return result["choices"][0]["message"]["content"]
                     else:
                         error_text = await response.text()
                         return f"Groq API error: {response.status} - {error_text}"
         except Exception as e:
             return f"Error with Groq summarization: {str(e)}"
     async def summarize_with_openrouter(self, query: str, search_results: List[SearchResult],
                                       temperature: float = 0.3, max_tokens: int = 2000) -> str:
-        """Summarize using OpenRouter API"""
         if not self.openrouter_api_key:
             return "OpenRouter API key not provided"
         try:
-            # Prepare the content for summarization
-            content_json = {
-                "user_query": query,
-                "search_results": []
-            }
-            for result in search_results:
-                content_json["search_results"].append({
-                    "title": result.title,
-                    "url": result.url,
-                    "snippet": result.snippet,
-                    "content": result.content[:2000],  # Limit content length
-                    "publication_date": result.publication_date,
-                    "relevance_score": result.relevance_score
-                })
-            user_prompt = f"""Please summarize the following search results for the query: "{query}"
-Search Results Data:
-{json.dumps(content_json, indent=2)}
-Provide a comprehensive summary that directly answers the user's query based on the most relevant and recent information available."""
             headers = {
                 "Authorization": f"Bearer {self.openrouter_api_key}",
@@ -714,7 +800,12 @@ Provide a comprehensive summary that directly answers the user's query based on
                                       headers=headers, json=payload) as response:
                     if response.status == 200:
                         result = await response.json()
-                        return result["choices"][0]["message"]["content"]
                     else:
                         error_text = await response.text()
                         return f"OpenRouter API error: {response.status} - {error_text}"
@@ -797,6 +888,13 @@ class AISearchEngine:
             results_with_content = [r for r in scraped_results if r.content.strip() and len(r.content.strip()) > 100]
             status_updates.append(f"Successfully scraped {len(results_with_content)} articles with meaningful content")
             # If we don't have enough content, try to get some from snippets
             if len(results_with_content) < 3:
                 status_updates.append("Using search snippets as fallback content...")

             return search_results
 class LLMSummarizer:
+    """Improved summarizer with better content preparation and validation"""
     def __init__(self, groq_api_key: str = "", openrouter_api_key: str = ""):
         self.groq_api_key = groq_api_key
     def create_system_prompt(self) -> str:
         """Create system prompt for summarization"""
+        return """You are an expert research assistant. Your task is to analyze search results and provide a comprehensive, accurate summary that directly answers the user's query.
+CRITICAL INSTRUCTIONS:
+1. ONLY use information that is directly relevant to the user's query
+2. If the search results don't contain relevant information, explicitly state this
+3. Don't make up information or provide generic advice
+4. Synthesize information from multiple sources when available
+5. Include specific facts, dates, numbers, and quotes when present
+6. If information is contradictory between sources, mention this
+7. Cite sources by mentioning the publication or website name
+8. Be specific and detailed rather than vague
+If the search results are not relevant to the query, respond with: "The search results do not contain sufficient relevant information to answer your query about [topic]. The results primarily contained [brief description of what was actually found]."
 Format your response as a comprehensive summary, not bullet points."""
+    # Filters search_results down to those that have enough content and mention
+    # at least one query keyword. Returns a tuple of (kept results, newline-joined
+    # log with one line per result saying whether it was kept or skipped and why).
+    def validate_content_quality(self, search_results: List[SearchResult], query: str) -> Tuple[List[SearchResult], str]:
+        """Validate and filter content quality before summarization"""
+        valid_results = []
+        validation_info = []
+        # Keywords are the whitespace-separated, lowercased tokens of the query;
+        # punctuation is not stripped.
+        query_keywords = set(query.lower().split())
+        for result in search_results:
+            # Results with missing content or fewer than 100 non-padding characters are dropped.
+            if not result.content or len(result.content.strip()) < 100:
+                validation_info.append(f"Skipped '{result.title}' - insufficient content")
+                continue
+            # Check if content contains query-relevant terms
+            content_lower = result.content.lower()
+            title_lower = result.title.lower()
+            snippet_lower = result.snippet.lower()
+            # Count relevant keywords
+            # Matching is plain substring containment, not whole-word matching.
+            # Each keyword contributes at most once: 2 for a content hit, otherwise
+            # 1 for a title hit, otherwise 0.5 for a snippet hit.
+            relevant_score = 0
+            for keyword in query_keywords:
+                if len(keyword) > 2:  # Skip very short words
+                    if keyword in content_lower:
+                        relevant_score += 2
+                    elif keyword in title_lower:
+                        relevant_score += 1
+                    elif keyword in snippet_lower:
+                        relevant_score += 0.5
+            # Any positive score keeps the result; the score is only logged, not stored on the result.
+            if relevant_score > 0:
+                valid_results.append(result)
+                validation_info.append(f"✓ '{result.title}' - relevance score: {relevant_score}")
+            else:
+                validation_info.append(f"Skipped '{result.title}' - not relevant to query")
+        validation_summary = "\n".join(validation_info)
+        return valid_results, validation_summary
+    # Builds the plain-text block that is embedded in the user prompt: the query,
+    # the number of sources kept by validate_content_quality, then one
+    # "=== SOURCE n ===" section per kept result. When nothing passes validation,
+    # a message containing the validation log is returned instead.
+    def prepare_content_for_llm(self, query: str, search_results: List[SearchResult]) -> str:
+        """Prepare well-structured content for LLM"""
+        # Validate content first
+        valid_results, validation_info = self.validate_content_quality(search_results, query)
+        if not valid_results:
+            return f"""Query: "{query}"
+VALIDATION RESULTS:
+{validation_info}
+No search results contained relevant content for this query. Please provide a response indicating that insufficient relevant information was found."""
+        content_parts = [f'User Query: "{query}"\n']
+        content_parts.append(f"Number of relevant sources found: {len(valid_results)}\n")
+        for i, result in enumerate(valid_results, 1):
+            content_parts.append(f"=== SOURCE {i} ===")
+            content_parts.append(f"Title: {result.title}")
+            content_parts.append(f"URL: {result.url}")
+            # Date and relevance score lines are optional: emitted only when set / positive.
+            if result.publication_date:
+                content_parts.append(f"Date: {result.publication_date}")
+            if result.relevance_score > 0:
+                content_parts.append(f"Relevance Score: {result.relevance_score:.3f}")
+            # Include snippet if it's different from content start
+            # (compared against the first 50 characters of the snippet, on the unstripped content)
+            if result.snippet and not result.content.startswith(result.snippet[:50]):
+                content_parts.append(f"Snippet: {result.snippet}")
+            # Intelligently truncate content while preserving meaning
+            content = result.content.strip()
+            if len(content) > 3000:
+                # Try to find a good breaking point
+                truncate_at = 3000
+                # Look for sentence endings near the truncation point
+                # The first '.', '!' or '?' at index 2800..3199 wins, so the cut can land
+                # before or after 3000; with no match the hard cut at 3000 stands.
+                # NOTE(review): this loop reuses `i`, shadowing the enumerate index above.
+                # It is harmless here because `i` is only read before this point in each
+                # iteration and enumerate rebinds it, but a distinct name would be clearer.
+                for i in range(2800, 3200):
+                    if i < len(content) and content[i] in '.!?':
+                        truncate_at = i + 1
+                        break
+                content = content[:truncate_at] + "... [content truncated]"
+            content_parts.append(f"Content: {content}")
+            content_parts.append("")  # Empty line between sources
+        return "\n".join(content_parts)
     async def summarize_with_groq(self, query: str, search_results: List[SearchResult],
                                 temperature: float = 0.3, max_tokens: int = 2000) -> str:
+        """Improved Groq summarization with better content preparation"""
         if not self.groq_api_key:
             return "Groq API key not provided"
         try:
+            # Prepare well-structured content
+            prepared_content = self.prepare_content_for_llm(query, search_results)
+            # Debug output
+            print(f"DEBUG - Sending {len(prepared_content)} characters to Groq AI")
+            print(f"DEBUG - Results with content: {len([r for r in search_results if r.content])}")
+            print(f"DEBUG - First 300 chars: {prepared_content[:300]}...")
+            user_prompt = f"""Please analyze the following search results and provide a comprehensive summary that directly answers the user's query.
+{prepared_content}
+Instructions:
+- Focus ONLY on information relevant to the query: "{query}"
+- If the results don't contain relevant information, explicitly state this
+- Be specific and factual, include dates/numbers when available
+- Mention source publications when referencing information
+- Don't provide generic advice if specific information isn't found"""
             headers = {
                 "Authorization": f"Bearer {self.groq_api_key}",
                     {"role": "user", "content": user_prompt}
                 ],
                 "temperature": temperature,
+                "max_tokens": max_tokens,
+                "stream": False
             }
             async with aiohttp.ClientSession() as session:
                                       headers=headers, json=payload) as response:
                     if response.status == 200:
                         result = await response.json()
+                        summary = result["choices"][0]["message"]["content"]
+                        # Add debug info in development
+                        debug_info = f"\n\n[DEBUG - Content Sources: {len([r for r in search_results if r.content])} with content, {len(search_results)} total]"
+                        return summary + debug_info
                     else:
                         error_text = await response.text()
                         return f"Groq API error: {response.status} - {error_text}"
         except Exception as e:
             return f"Error with Groq summarization: {str(e)}"
     async def summarize_with_openrouter(self, query: str, search_results: List[SearchResult],
                                       temperature: float = 0.3, max_tokens: int = 2000) -> str:
+        """Improved OpenRouter summarization with better content preparation"""
         if not self.openrouter_api_key:
             return "OpenRouter API key not provided"
         try:
+            # Prepare well-structured content
+            prepared_content = self.prepare_content_for_llm(query, search_results)
+            # Debug output
+            print(f"DEBUG - Sending {len(prepared_content)} characters to OpenRouter AI")
+            print(f"DEBUG - Results with content: {len([r for r in search_results if r.content])}")
+            print(f"DEBUG - First 300 chars: {prepared_content[:300]}...")
+            user_prompt = f"""Please analyze the following search results and provide a comprehensive summary that directly answers the user's query.
+{prepared_content}
+Instructions:
+- Focus ONLY on information relevant to the query: "{query}"
+- If the results don't contain relevant information, explicitly state this
+- Be specific and factual, include dates/numbers when available
+- Mention source publications when referencing information
+- Don't provide generic advice if specific information isn't found"""
             headers = {
                 "Authorization": f"Bearer {self.openrouter_api_key}",
                                       headers=headers, json=payload) as response:
                     if response.status == 200:
                         result = await response.json()
+                        summary = result["choices"][0]["message"]["content"]
+                        # Add debug info in development
+                        debug_info = f"\n\n[DEBUG - Content Sources: {len([r for r in search_results if r.content])} with content, {len(search_results)} total]"
+                        return summary + debug_info
                     else:
                         error_text = await response.text()
                         return f"OpenRouter API error: {response.status} - {error_text}"
             results_with_content = [r for r in scraped_results if r.content.strip() and len(r.content.strip()) > 100]
             status_updates.append(f"Successfully scraped {len(results_with_content)} articles with meaningful content")
+            # Debug: Show what content we actually got
+            for i, result in enumerate(results_with_content[:3]):
+                print(f"Result {i+1}: {result.title}")
+                print(f"Content length: {len(result.content)}")
+                print(f"Content preview: {result.content[:200]}...")
+                print("---")
             # If we don't have enough content, try to get some from snippets
             if len(results_with_content) < 3:
                 status_updates.append("Using search snippets as fallback content...")