Spaces:

Guiyom
/

raindropseek

Sleeping

App Files Files Community

Guiyom committed on Jan 12, 2025

Commit

a3d7f9f

verified ·

1 Parent(s): 20c30a6

Update app.py

Browse files

Files changed (1) hide show

app.py +192 -47

app.py CHANGED Viewed

@@ -4,7 +4,9 @@ import requests
 import json
 import os
 import logging
-from typing import Dict, List
 from datetime import datetime
 from bs4 import BeautifulSoup
 from googlesearch import search
@@ -32,6 +34,68 @@ class RaindropSearchBot:
         self.client = OpenAI(api_key=self.openai_api_key)
         self.newsapi = NewsApiClient(api_key=self.newsapi_key)
     def get_google_results(self, query: str, num_results: int = 5) -> List[Dict]:
         """Get Google search results using googlesearch-python."""
         try:
@@ -210,59 +274,139 @@ class RaindropSearchBot:
             logger.error(f"Analysis generation error: {e}")
             return "Error generating analysis."
-    def format_results(self, results: List[Dict], google_results: List[Dict],
-                      news_results: List[Dict], analysis: str) -> str:
-        """Format all search results with analysis."""
-        output = f"{analysis}\n\n"
-        output += "-------\n\n"
         # Format Raindrop results
-        if results:
-            output += "🔍 Bookmarked Sources:\n\n"
-            for idx, item in enumerate(results, 1):
-                if item.get('title') or item.get('link'):
-                    output += f"{idx}. {item.get('title', 'No Title')}\n"
-                    if item.get('link'):
-                        output += f"   Link: {item['link']}\n"
-                    if item.get('tags'):
-                        output += f"   Tags: {', '.join(item['tags'])}\n"
-                    if item.get('excerpt'):
-                        output += f"   Description: {item['excerpt'][:200]}...\n"
-                    if item.get('created'):
-                        created_date = item['created'][:10]
-                        output += f"   Created: {created_date}\n"
-                    output += "\n"
         # Format Google results
         if google_results:
-            output += "🌐 Web Sources:\n\n"
-            for idx, item in enumerate(google_results, 1):
-                output += f"{idx}. {item.get('title', 'No Title')}\n"
-                if item.get('link'):
-                    output += f"   Link: {item['link']}\n"
-                if item.get('snippet'):
-                    output += f"   Description: {item['snippet']}\n"
-                output += "\n"
         # Format News results
         if news_results:
-            output += "📰 Recent News:\n\n"
-            for idx, item in enumerate(news_results, 1):
-                output += f"{idx}. {item.get('title', 'No Title')}\n"
-                if item.get('url'):
-                    output += f"   Link: {item['url']}\n"
-                if item.get('description'):
-                    output += f"   Description: {item['description']}\n"
-                if item.get('publishedAt'):
-                    output += f"   Published: {item['publishedAt'][:10]}\n"
                 if item.get('source', {}).get('name'):
-                    output += f"   Source: {item['source']['name']}\n"
-                output += "\n"
         return output
     def process_request(self, user_request: str) -> str:
-        """Process the user request with enhanced error handling."""
         try:
             logger.info(f"Processing request: {user_request}")
@@ -275,15 +419,16 @@ class RaindropSearchBot:
             google_results = self.get_google_results(search_query)
             news_results = self.get_news_results(search_query)
-            logger.info(f"Found {len(raindrop_results)} Raindrop results")
-            logger.info(f"Found {len(google_results)} Google results")
-            logger.info(f"Found {len(news_results)} News results")
-            # Generate analysis
-            analysis = self.analyze_results(raindrop_results, google_results, news_results, user_request)
             # Format and return results
-            return self.format_results(raindrop_results, google_results, news_results, analysis)
         except Exception as e:
             logger.error(f"Error processing request: {e}", exc_info=True)

 import json
 import os
 import logging
+from typing import Dict, List, Tuple, Optional
+from newspaper import Article
+import markdown
 from datetime import datetime
 from bs4 import BeautifulSoup
 from googlesearch import search
         self.client = OpenAI(api_key=self.openai_api_key)
         self.newsapi = NewsApiClient(api_key=self.newsapi_key)
+    def extract_content_from_url(self, url: str) -> Optional[str]:
+        """Extract main content from a URL using newspaper3k."""
+        try:
+            article = Article(url)
+            article.download()
+            time.sleep(1)  # Polite delay between requests
+            article.parse()
+            # Combine title and text
+            content = f"{article.title}\n\n{article.text}"
+            return content if content.strip() else None
+        except Exception as e:
+            logger.error(f"Error extracting content from {url}: {e}")
+            return None
+    def get_content_and_summary(self, item: Dict, source_type: str) -> Dict:
+        """Get content and generate summary for a single item."""
+        try:
+            # Get URL based on source type
+            url = item.get('link') or item.get('url')
+            if not url:
+                return item
+            # For Raindrop items, use existing excerpt if available
+            if source_type == 'raindrop' and item.get('excerpt'):
+                content = item['excerpt']
+            else:
+                content = self.extract_content_from_url(url)
+            if not content:
+                return item
+            # Generate summary focused on the query topic
+            try:
+                prompt = f"""
+                Analyze this content and provide a detailed summary focusing on key points relevant
+                to our topic. Include specific details, data, and quotes if relevant.
+                Content: {content[:4000]}  # Limit content length for token constraints
+                Provide a concise but detailed summary in 2-3 paragraphs.
+                """
+                response = self.client.chat.completions.create(
+                    model="gpt-4o-mini",
+                    messages=[{"role": "user", "content": prompt}],
+                    temperature=0.3,
+                    max_tokens=300
+                )
+                item['detailed_summary'] = response.choices[0].message.content
+            except Exception as e:
+                logger.error(f"Error generating summary: {e}")
+                item['detailed_summary'] = "Summary generation failed."
+            return item
+        except Exception as e:
+            logger.error(f"Error processing item: {e}")
+            return item
     def get_google_results(self, query: str, num_results: int = 5) -> List[Dict]:
         """Get Google search results using googlesearch-python."""
         try:
             logger.error(f"Analysis generation error: {e}")
             return "Error generating analysis."
+    def format_results(self, results: Tuple[List[Dict], List[Dict], List[Dict]],
+                      essay: str) -> str:
+        """Format the essay and results with detailed summaries.
+
+        ``results`` is unpacked as (raindrop, google, news) result lists. The
+        returned markdown starts with ``essay``, followed by a ``---`` rule and
+        a references section. Every item gets a running reference number that
+        continues across the three source sections; an item without
+        ``'detailed_summary'`` shows "No summary available.".
+        """
+        raindrop_results, google_results, news_results = results
+        output = f"{essay}\n\n"
+        output += "---\n\n"
+        output += "# References and Detailed Summaries\n\n"
+        # Shared counter: numbering is not restarted per section.
+        ref_counter = 1
         # Format Raindrop results
+        if raindrop_results:
+            output += "## 🔍 Bookmarked Sources\n\n"
+            for item in raindrop_results:
+                output += f"### [{ref_counter}] {item.get('title', 'No Title')}\n"
+                # The link line is always written, even when 'link' is absent.
+                output += f"**Link**: {item.get('link')}\n"
+                if item.get('tags'):
+                    output += f"**Tags**: {', '.join(item['tags'])}\n"
+                if item.get('created'):
+                    # Keep only the first 10 characters of the timestamp.
+                    output += f"**Created**: {item['created'][:10]}\n"
+                output += "\n**Summary**:\n"
+                output += f"{item.get('detailed_summary', 'No summary available.')}\n\n"
+                ref_counter += 1
         # Format Google results
         if google_results:
+            output += "## 🌐 Web Sources\n\n"
+            for item in google_results:
+                output += f"### [{ref_counter}] {item.get('title', 'No Title')}\n"
+                output += f"**Link**: {item.get('link')}\n"
+                output += "\n**Summary**:\n"
+                output += f"{item.get('detailed_summary', 'No summary available.')}\n\n"
+                ref_counter += 1
         # Format News results
         if news_results:
+            output += "## 📰 Recent News\n\n"
+            for item in news_results:
+                output += f"### [{ref_counter}] {item.get('title', 'No Title')}\n"
+                # News items are read via 'url' rather than 'link'.
+                output += f"**Link**: {item.get('url')}\n"
                 if item.get('source', {}).get('name'):
+                    output += f"**Source**: {item['source']['name']}\n"
+                if item.get('publishedAt'):
+                    output += f"**Published**: {item['publishedAt'][:10]}\n"
+                output += "\n**Summary**:\n"
+                output += f"{item.get('detailed_summary', 'No summary available.')}\n\n"
+                ref_counter += 1
         return output
+    def process_all_results(self, raindrop_results: List[Dict],
+                          google_results: List[Dict],
+                          news_results: List[Dict]) -> Tuple[List[Dict], List[Dict], List[Dict]]:
+        """Process and enrich all results with content and summaries."""
+        processed_raindrop = []
+        for item in raindrop_results:
+            processed_item = self.get_content_and_summary(item, 'raindrop')
+            if processed_item.get('detailed_summary'):
+                processed_raindrop.append(processed_item)
+        processed_google = []
+        for item in google_results:
+            processed_item = self.get_content_and_summary(item, 'google')
+            if processed_item.get('detailed_summary'):
+                processed_google.append(processed_item)
+        processed_news = []
+        for item in news_results:
+            processed_item = self.get_content_and_summary(item, 'news')
+            if processed_item.get('detailed_summary'):
+                processed_news.append(processed_item)
+        return processed_raindrop, processed_google, processed_news
+    def generate_essay_response(self, results: Tuple[List[Dict], List[Dict], List[Dict]],
+                              user_query: str) -> str:
+        """Generate a structured essay-style response with references."""
+        raindrop_results, google_results, news_results = results
+        # Collect all content for analysis
+        all_content = ""
+        reference_map = {}
+        ref_counter = 1
+        for source_list in [raindrop_results, google_results, news_results]:
+            for item in source_list:
+                if item.get('detailed_summary'):
+                    all_content += f"\n{item['detailed_summary']}\n"
+                    reference_map[item['link']] = ref_counter
+                    ref_counter += 1
+        try:
+            prompt = f"""
+            Create a comprehensive essay-style analysis about: {user_query}
+            Use this content as your source material:
+            {all_content}
+            Requirements:
+            1. Structure the response in clear sections with markdown headers
+            2. Include an introduction and conclusion
+            3. Use reference numbers [n] to cite sources
+            4. Make connections between different sources
+            5. Highlight key findings and trends
+            6. Address any contradictions or gaps
+            7. Use markdown formatting for better readability
+            Format the response as a proper academic essay with sections.
+            """
+            response = self.client.chat.completions.create(
+                model="gpt-4o-mini",
+                messages=[{"role": "user", "content": prompt}],
+                temperature=0.5,
+                max_tokens=1500
+            )
+            essay = response.choices[0].message.content
+            # Replace reference placeholders with actual reference numbers
+            for url, ref_num in reference_map.items():
+                essay = essay.replace(f'[URL:{url}]', f'[{ref_num}]')
+            return essay
+        except Exception as e:
+            logger.error(f"Error generating essay: {e}")
+            return "Error generating analysis."
     def process_request(self, user_request: str) -> str:
+        """Process the user request with enhanced content collection and analysis."""
         try:
             logger.info(f"Processing request: {user_request}")
             google_results = self.get_google_results(search_query)
             news_results = self.get_news_results(search_query)
+            # Process all results to get content and summaries
+            processed_results = self.process_all_results(
+                raindrop_results, google_results, news_results
+            )
+            # Generate essay-style analysis
+            essay = self.generate_essay_response(processed_results, user_request)
             # Format and return results
+            return self.format_results(processed_results, essay)
         except Exception as e:
             logger.error(f"Error processing request: {e}", exc_info=True)