Shreyas94 committed on
Commit
9d35d68
·
verified ·
1 Parent(s): 8675311

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +46 -207
app.py CHANGED
@@ -399,12 +399,10 @@ class ContentScraper:
399
  except:
400
  continue
401
 
402
- # Clean and limit content
403
  if content:
404
  # Remove excessive whitespace
405
  content = ' '.join(content.split())
406
- # Limit length
407
- content = content[:3000]
408
 
409
  return content, pub_date
410
 
@@ -424,7 +422,7 @@ class ContentScraper:
424
  article.parse()
425
 
426
  if article.text and len(article.text.strip()) > 100:
427
- content = article.text.strip()[:3000]
428
  pub_date = article.publish_date.isoformat() if article.publish_date else None
429
  return content, pub_date
430
 
@@ -581,7 +579,7 @@ class EmbeddingFilter:
581
  return search_results
582
 
583
  class LLMSummarizer:
584
- """Improved summarizer with better content preparation and validation"""
585
 
586
  def __init__(self, groq_api_key: str = "", openrouter_api_key: str = ""):
587
  self.groq_api_key = groq_api_key
@@ -594,118 +592,30 @@ class LLMSummarizer:
594
  return """You are an expert research assistant. Your task is to analyze search results and provide a comprehensive, accurate summary that directly answers the user's query.
595
 
596
  CRITICAL INSTRUCTIONS:
597
- 1. Analyze ALL provided content carefully - even if it seems only tangentially related
598
- 2. Look for connections between the query and the content, even if not immediately obvious
599
- 3. If content is about a parent company/organization mentioned in the query, include relevant information
600
- 4. Extract and synthesize any information that could be relevant to answering the user's question
601
- 5. Include specific facts, dates, numbers, and quotes when present
602
- 6. If information is contradictory between sources, mention this
603
- 7. Cite sources by mentioning the publication or website name
604
- 8. Be thorough and detailed rather than dismissive
605
 
606
- ONLY state that results are not relevant if they are completely unrelated to any aspect of the query. If there is ANY connection (like parent company info, related business segments, etc.), include that information.
607
-
608
- Format your response as a comprehensive summary, not bullet points."""
609
-
610
- def validate_content_quality(self, search_results: List[SearchResult], query: str) -> Tuple[List[SearchResult], str]:
611
- """Validate and filter content quality before summarization"""
612
- valid_results = []
613
- validation_info = []
614
-
615
- # More intelligent keyword extraction
616
- query_lower = query.lower()
617
-
618
- # Extract key entities and terms
619
- important_keywords = []
620
-
621
- # Split query into words and extract meaningful terms
622
- words = query_lower.split()
623
- for word in words:
624
- if len(word) > 2 and word not in ['news', 'latest', 'recent', 'update', 'information', 'about']:
625
- important_keywords.append(word)
626
-
627
- # Also look for multi-word entities (like company names)
628
- # Extract potential company/entity names from query
629
- entity_patterns = [
630
- r'\b[A-Z][a-z]+ [A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b', # Proper names
631
- r'\b[A-Z]{2,}(?:\s+[A-Z][a-z]+)*\b', # Acronyms
632
- ]
633
-
634
- for pattern in entity_patterns:
635
- matches = re.findall(pattern, query)
636
- for match in matches:
637
- important_keywords.extend(match.lower().split())
638
-
639
- # Remove duplicates
640
- important_keywords = list(set(important_keywords))
641
-
642
- for result in search_results:
643
- if not result.content or len(result.content.strip()) < 50: # Lowered threshold
644
- validation_info.append(f"Skipped '{result.title}' - insufficient content")
645
- continue
646
-
647
- # Check if content contains query-relevant terms
648
- content_lower = result.content.lower()
649
- title_lower = result.title.lower()
650
- snippet_lower = result.snippet.lower()
651
- combined_text = f"{title_lower} {snippet_lower} {content_lower}"
652
-
653
- # More flexible relevance scoring
654
- relevant_score = 0
655
- matched_keywords = []
656
-
657
- for keyword in important_keywords:
658
- if keyword in combined_text:
659
- if keyword in content_lower:
660
- relevant_score += 2
661
- matched_keywords.append(keyword)
662
- elif keyword in title_lower:
663
- relevant_score += 3 # Title matches are very important
664
- matched_keywords.append(keyword)
665
- elif keyword in snippet_lower:
666
- relevant_score += 1
667
- matched_keywords.append(keyword)
668
-
669
- # Special handling for acronyms and company names
670
- # If query contains a company acronym (like KKR), be more lenient
671
- has_company_match = any(len(kw) <= 4 and kw.isupper() for kw in query.split())
672
- if has_company_match:
673
- relevant_score += 1 # Boost score for company-related queries
674
-
675
- # Lower the threshold and accept more results
676
- if relevant_score >= 1 or len(matched_keywords) >= 1:
677
- valid_results.append(result)
678
- validation_info.append(f"βœ“ '{result.title}' - score: {relevant_score}, matched: {matched_keywords}")
679
- else:
680
- validation_info.append(f"Skipped '{result.title}' - no relevant keywords found")
681
-
682
- # If we filtered out too many results, be more lenient
683
- if len(valid_results) < len(search_results) * 0.3: # If we filtered out more than 70%
684
- validation_info.append("⚠️ Too many results filtered, being more lenient...")
685
- # Add back results that have any content
686
- for result in search_results:
687
- if result not in valid_results and result.content.strip():
688
- valid_results.append(result)
689
- validation_info.append(f"βœ“ '{result.title}' - added back (lenient mode)")
690
-
691
- validation_summary = "\n".join(validation_info)
692
- return valid_results, validation_summary
693
 
694
  def prepare_content_for_llm(self, query: str, search_results: List[SearchResult]) -> str:
695
- """Prepare well-structured content for LLM"""
696
- # Validate content first
697
- valid_results, validation_info = self.validate_content_quality(search_results, query)
 
698
 
699
  if not valid_results:
700
  return f"""Query: "{query}"
701
 
702
- VALIDATION RESULTS:
703
- {validation_info}
704
-
705
- The search results did not pass the initial relevance filter, but this might be overly restrictive. Please analyze the raw content provided and extract any information that could be relevant to answering the user's query, even if the connection is not immediately obvious."""
706
 
707
  content_parts = [f'User Query: "{query}"\n']
708
- content_parts.append(f"Number of relevant sources found: {len(valid_results)}\n")
709
 
710
  for i, result in enumerate(valid_results, 1):
711
  content_parts.append(f"=== SOURCE {i} ===")
@@ -722,108 +632,39 @@ The search results did not pass the initial relevance filter, but this might be
722
  if result.snippet and not result.content.startswith(result.snippet[:50]):
723
  content_parts.append(f"Snippet: {result.snippet}")
724
 
725
- # Intelligently truncate content while preserving meaning
726
  content = result.content.strip()
727
- if len(content) > 3000:
728
- # Try to find a good breaking point
729
- truncate_at = 3000
730
- # Look for sentence endings near the truncation point
731
- for i in range(2800, 3200):
732
- if i < len(content) and content[i] in '.!?':
733
- truncate_at = i + 1
734
- break
735
- content = content[:truncate_at] + "... [content truncated]"
736
-
737
  content_parts.append(f"Content: {content}")
738
  content_parts.append("") # Empty line between sources
739
 
740
  return "\n".join(content_parts)
741
 
742
  async def summarize_with_groq(self, query: str, search_results: List[SearchResult],
743
- temperature: float = 0.3, max_tokens: int = 2000) -> str:
744
- """Improved Groq summarization with better content preparation"""
745
  if not self.groq_api_key:
746
  return "Groq API key not provided"
747
 
748
  try:
749
- # Prepare well-structured content
750
  prepared_content = self.prepare_content_for_llm(query, search_results)
751
 
752
  # Debug output
753
  print(f"DEBUG - Sending {len(prepared_content)} characters to Groq AI")
754
  print(f"DEBUG - Results with content: {len([r for r in search_results if r.content])}")
755
- print(f"DEBUG - First 300 chars: {prepared_content[:300]}...")
756
-
757
- user_prompt = f"""Please analyze the following search results and provide a comprehensive summary that directly answers the user's query.
758
-
759
- {prepared_content}
760
-
761
- Instructions:
762
- - Focus ONLY on information relevant to the query: "{query}"
763
- - If the results don't contain relevant information, explicitly state this
764
- - Be specific and factual, include dates/numbers when available
765
- - Mention source publications when referencing information
766
- - Don't provide generic advice if specific information isn't found"""
767
-
768
- headers = {
769
- "Authorization": f"Bearer {self.groq_api_key}",
770
- "Content-Type": "application/json"
771
- }
772
-
773
- payload = {
774
- "model": self.groq_model,
775
- "messages": [
776
- {"role": "system", "content": self.create_system_prompt()},
777
- {"role": "user", "content": user_prompt}
778
- ],
779
- "temperature": temperature,
780
- "max_tokens": max_tokens,
781
- "stream": False
782
- }
783
-
784
- async with aiohttp.ClientSession() as session:
785
- async with session.post("https://api.groq.com/openai/v1/chat/completions",
786
- headers=headers, json=payload) as response:
787
- if response.status == 200:
788
- result = await response.json()
789
- summary = result["choices"][0]["message"]["content"]
790
-
791
- # Add debug info in development
792
- debug_info = f"\n\n[DEBUG - Content Sources: {len([r for r in search_results if r.content])} with content, {len(search_results)} total]"
793
- return summary + debug_info
794
-
795
- else:
796
- error_text = await response.text()
797
- return f"Groq API error: {response.status} - {error_text}"
798
-
799
- except Exception as e:
800
- return f"Error with Groq summarization: {str(e)}"
801
-
802
- async def summarize_with_openrouter(self, query: str, search_results: List[SearchResult],
803
- temperature: float = 0.3, max_tokens: int = 2000) -> str:
804
- """Improved OpenRouter summarization with better content preparation"""
805
- if not self.openrouter_api_key:
806
- return "OpenRouter API key not provided"
807
-
808
- try:
809
- # Prepare well-structured content
810
- prepared_content = self.prepare_content_for_llm(query, search_results)
811
-
812
- # Debug output
813
- print(f"DEBUG - Sending {len(prepared_content)} characters to OpenRouter AI")
814
- print(f"DEBUG - Results with content: {len([r for r in search_results if r.content])}")
815
- print(f"DEBUG - First 300 chars: {prepared_content[:300]}...")
816
 
817
  user_prompt = f"""Please analyze the following search results and provide a comprehensive summary that directly answers the user's query.
818
 
819
  {prepared_content}
820
 
821
  Instructions:
822
- - Focus ONLY on information relevant to the query: "{query}"
823
- - If the results don't contain relevant information, explicitly state this
824
  - Be specific and factual, include dates/numbers when available
825
  - Mention source publications when referencing information
826
- - Don't provide generic advice if specific information isn't found"""
 
827
 
828
  headers = {
829
  "Authorization": f"Bearer {self.openrouter_api_key}",
@@ -849,8 +690,8 @@ Instructions:
849
  result = await response.json()
850
  summary = result["choices"][0]["message"]["content"]
851
 
852
- # Add debug info in development
853
- debug_info = f"\n\n[DEBUG - Content Sources: {len([r for r in search_results if r.content])} with content, {len(search_results)} total]"
854
  return summary + debug_info
855
 
856
  else:
@@ -931,9 +772,9 @@ class AISearchEngine:
931
  max_successful=target_successful
932
  )
933
 
934
- # Filter results with meaningful content
935
- results_with_content = [r for r in scraped_results if r.content.strip() and len(r.content.strip()) > 100]
936
- status_updates.append(f"Successfully scraped {len(results_with_content)} articles with meaningful content")
937
 
938
  # Debug: Show what content we actually got
939
  for i, result in enumerate(results_with_content[:3]):
@@ -971,8 +812,8 @@ class AISearchEngine:
971
  if not results_with_content:
972
  return "No relevant results found after filtering", "\n".join(status_updates)
973
 
974
- # Step 5: LLM Summarization
975
- status_updates.append(f"πŸ€– Generating summary using {model}...")
976
 
977
  try:
978
  if model.startswith("Groq"):
@@ -1006,6 +847,7 @@ class AISearchEngine:
1006
  metadata += f"- Search engines: {', '.join(search_engines)}\n"
1007
  metadata += f"- Model: {model}\n"
1008
  metadata += f"- Embeddings used: {use_embeddings}\n"
 
1009
 
1010
  final_summary = summary + metadata
1011
  status_updates.append(f"βœ… Summary generated in {processing_time:.2f}s")
@@ -1111,7 +953,7 @@ async def chat_inference(message, history, groq_key, openrouter_key, model_choic
1111
  yield "🧠 Filtering results using embeddings..."
1112
  await asyncio.sleep(0.1)
1113
 
1114
- yield "πŸ€– Generating AI-powered summary..."
1115
  await asyncio.sleep(0.1)
1116
 
1117
  # Perform the actual search and summarization
@@ -1182,12 +1024,12 @@ def create_gradio_interface():
1182
  info="Number of search results to fetch from each engine"
1183
  ),
1184
  gr.Slider(
1185
- minimum=500,
1186
- maximum=4000,
1187
- value=2000,
1188
- step=100,
1189
- label="πŸ“ Max Tokens",
1190
- info="Maximum length of the AI-generated summary"
1191
  )
1192
  ]
1193
 
@@ -1196,26 +1038,23 @@ def create_gradio_interface():
1196
  fn=chat_inference,
1197
  additional_inputs=additional_inputs,
1198
  additional_inputs_accordion=gr.Accordion("βš™οΈ Configuration & Advanced Parameters", open=True),
1199
- title="πŸ” AI-Powered Search Engine",
1200
  description="""
1201
  **Search across Google, Bing, and Yahoo, then get AI-powered summaries!**
1202
 
1203
  ✨ **Features:** Multi-engine search β€’ Query enhancement β€’ Parallel scraping β€’ AI summarization β€’ Embedding filtering
 
1204
 
1205
  πŸ“‹ **Quick Start:** 1) Add your API key below 2) Select search engines 3) Ask any question!
1206
  """,
1207
  cache_examples=False,
1208
- #retry_btn="πŸ”„ Retry",
1209
- #undo_btn="↩️ Undo",
1210
- #clear_btn="πŸ—‘οΈ Clear",
1211
  submit_btn="πŸ” Search & Summarize",
1212
  stop_btn="⏹️ Stop",
1213
  chatbot=gr.Chatbot(
1214
  show_copy_button=True,
1215
- #likeable=True,
1216
  layout="bubble",
1217
  height=600,
1218
- placeholder="πŸš€ Ready to search! Configure your settings below and ask me anything.",
1219
  show_share_button=True
1220
  ),
1221
  theme=gr.themes.Soft(),
@@ -1227,4 +1066,4 @@ def create_gradio_interface():
1227
 
1228
  if __name__ == "__main__":
1229
  demo = create_gradio_interface()
1230
- demo.launch(share=True)
 
399
  except:
400
  continue
401
 
402
+ # Don't limit content length here - let LLM handle full content
403
  if content:
404
  # Remove excessive whitespace
405
  content = ' '.join(content.split())
 
 
406
 
407
  return content, pub_date
408
 
 
422
  article.parse()
423
 
424
  if article.text and len(article.text.strip()) > 100:
425
+ content = article.text.strip() # Don't limit content length
426
  pub_date = article.publish_date.isoformat() if article.publish_date else None
427
  return content, pub_date
428
 
 
579
  return search_results
580
 
581
  class LLMSummarizer:
582
+ """Improved summarizer without content validation filtering - sends all scraped content to LLM"""
583
 
584
  def __init__(self, groq_api_key: str = "", openrouter_api_key: str = ""):
585
  self.groq_api_key = groq_api_key
 
592
  return """You are an expert research assistant. Your task is to analyze search results and provide a comprehensive, accurate summary that directly answers the user's query.
593
 
594
  CRITICAL INSTRUCTIONS:
595
+ 1. Analyze ALL provided content carefully and thoroughly
596
+ 2. Extract and synthesize any information relevant to answering the user's question
597
+ 3. Include specific facts, dates, numbers, and quotes when present
598
+ 4. If information is contradictory between sources, mention this
599
+ 5. Cite sources by mentioning the publication or website name
600
+ 6. Be thorough and detailed in your analysis
601
+ 7. If some content seems tangentially related, still include relevant portions
602
+ 8. Focus on directly answering the user's query with the most relevant information first
603
 
604
+ Format your response as a comprehensive summary, not bullet points. Provide a thorough analysis of all the content provided."""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
605
 
606
  def prepare_content_for_llm(self, query: str, search_results: List[SearchResult]) -> str:
607
+ """Prepare content for LLM without validation filtering - include ALL scraped content"""
608
+
609
+ # No content validation - include all results that have any content
610
+ valid_results = [result for result in search_results if result.content.strip()]
611
 
612
  if not valid_results:
613
  return f"""Query: "{query}"
614
 
615
+ No content was successfully scraped from the search results. This might be due to anti-bot protections or network issues."""
 
 
 
616
 
617
  content_parts = [f'User Query: "{query}"\n']
618
+ content_parts.append(f"Number of sources with content: {len(valid_results)}\n")
619
 
620
  for i, result in enumerate(valid_results, 1):
621
  content_parts.append(f"=== SOURCE {i} ===")
 
632
  if result.snippet and not result.content.startswith(result.snippet[:50]):
633
  content_parts.append(f"Snippet: {result.snippet}")
634
 
635
+ # Include FULL content without truncation - let the LLM handle the large context
636
  content = result.content.strip()
 
 
 
 
 
 
 
 
 
 
637
  content_parts.append(f"Content: {content}")
638
  content_parts.append("") # Empty line between sources
639
 
640
  return "\n".join(content_parts)
641
 
642
  async def summarize_with_groq(self, query: str, search_results: List[SearchResult],
643
+ temperature: float = 0.3, max_tokens: int = 8000) -> str:
644
+ """Enhanced Groq summarization with increased token limits and no content filtering"""
645
  if not self.groq_api_key:
646
  return "Groq API key not provided"
647
 
648
  try:
649
+ # Prepare content without validation filtering
650
  prepared_content = self.prepare_content_for_llm(query, search_results)
651
 
652
  # Debug output
653
  print(f"DEBUG - Sending {len(prepared_content)} characters to Groq AI")
654
  print(f"DEBUG - Results with content: {len([r for r in search_results if r.content])}")
655
+ print(f"DEBUG - Max completion tokens: {max_tokens}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
656
 
657
  user_prompt = f"""Please analyze the following search results and provide a comprehensive summary that directly answers the user's query.
658
 
659
  {prepared_content}
660
 
661
  Instructions:
662
+ - Focus on information relevant to the query: "{query}"
663
+ - Analyze ALL provided content thoroughly
664
  - Be specific and factual, include dates/numbers when available
665
  - Mention source publications when referencing information
666
+ - If results contain limited relevant information, state this clearly but still extract what you can
667
+ - Provide a comprehensive analysis of all available content"""
668
 
669
  headers = {
670
  "Authorization": f"Bearer {self.openrouter_api_key}",
 
690
  result = await response.json()
691
  summary = result["choices"][0]["message"]["content"]
692
 
693
+ # Add debug info
694
+ debug_info = f"\n\n[Content Sources: {len([r for r in search_results if r.content])} with content, {len(search_results)} total]"
695
  return summary + debug_info
696
 
697
  else:
 
772
  max_successful=target_successful
773
  )
774
 
775
+ # Include ALL results with any content (no filtering)
776
+ results_with_content = [r for r in scraped_results if r.content.strip()]
777
+ status_updates.append(f"Successfully scraped {len(results_with_content)} articles with content")
778
 
779
  # Debug: Show what content we actually got
780
  for i, result in enumerate(results_with_content[:3]):
 
812
  if not results_with_content:
813
  return "No relevant results found after filtering", "\n".join(status_updates)
814
 
815
+ # Step 5: LLM Summarization - now sends ALL content without validation filtering
816
+ status_updates.append(f"πŸ€– Generating summary using {model} (processing all scraped content)...")
817
 
818
  try:
819
  if model.startswith("Groq"):
 
847
  metadata += f"- Search engines: {', '.join(search_engines)}\n"
848
  metadata += f"- Model: {model}\n"
849
  metadata += f"- Embeddings used: {use_embeddings}\n"
850
+ metadata += f"- Content filtering: DISABLED (all content sent to LLM)\n"
851
 
852
  final_summary = summary + metadata
853
  status_updates.append(f"βœ… Summary generated in {processing_time:.2f}s")
 
953
  yield "🧠 Filtering results using embeddings..."
954
  await asyncio.sleep(0.1)
955
 
956
+ yield "πŸ€– Generating AI-powered summary (processing all scraped content)..."
957
  await asyncio.sleep(0.1)
958
 
959
  # Perform the actual search and summarization
 
1024
  info="Number of search results to fetch from each engine"
1025
  ),
1026
  gr.Slider(
1027
+ minimum=1000,
1028
+ maximum=8000,
1029
+ value=8000,
1030
+ step=500,
1031
+ label="πŸ“ Max Completion Tokens",
1032
+ info="Maximum length of the AI-generated summary (Groq: up to 8000, OpenRouter: up to 4000)"
1033
  )
1034
  ]
1035
 
 
1038
  fn=chat_inference,
1039
  additional_inputs=additional_inputs,
1040
  additional_inputs_accordion=gr.Accordion("βš™οΈ Configuration & Advanced Parameters", open=True),
1041
+ title="πŸ” AI-Powered Search Engine - No Content Filtering",
1042
  description="""
1043
  **Search across Google, Bing, and Yahoo, then get AI-powered summaries!**
1044
 
1045
  ✨ **Features:** Multi-engine search β€’ Query enhancement β€’ Parallel scraping β€’ AI summarization β€’ Embedding filtering
1046
+ πŸš€ **Updated:** All scraped content is now sent to the LLM without filtering β€’ Increased Groq token limits (up to 8K)
1047
 
1048
  πŸ“‹ **Quick Start:** 1) Add your API key below 2) Select search engines 3) Ask any question!
1049
  """,
1050
  cache_examples=False,
 
 
 
1051
  submit_btn="πŸ” Search & Summarize",
1052
  stop_btn="⏹️ Stop",
1053
  chatbot=gr.Chatbot(
1054
  show_copy_button=True,
 
1055
  layout="bubble",
1056
  height=600,
1057
+ placeholder="πŸš€ Ready to search! All scraped content will be sent to the LLM for comprehensive analysis.",
1058
  show_share_button=True
1059
  ),
1060
  theme=gr.themes.Soft(),
 
1066
 
1067
  if __name__ == "__main__":
1068
  demo = create_gradio_interface()
1069
+ demo.launch(share=True)