Spaces:

Guiyom
/

raindropseek

Sleeping

App Files Community

Guiyom committed on Jan 12, 2025

Commit

3e17624

verified ·

1 Parent(s): 5dc7b85

Update app.py

Browse files

Files changed (1) hide show

app.py +51 -16

app.py CHANGED Viewed

@@ -35,16 +35,43 @@ class RaindropSearchBot:
         self.newsapi = NewsApiClient(api_key=self.newsapi_key)
     def extract_content_from_url(self, url: str) -> Optional[str]:
-        """Extract main content from a URL using newspaper3k."""
         try:
-            article = Article(url)
-            article.download()
-            time.sleep(1)  # Polite delay between requests
-            article.parse()
-            # Combine title and text
-            content = f"{article.title}\n\n{article.text}"
-            return content if content.strip() else None
         except Exception as e:
             logger.error(f"Error extracting content from {url}: {e}")
@@ -57,27 +84,33 @@ class RaindropSearchBot:
             url = item.get('link') or item.get('url')
             if not url:
                 return item
             # For Raindrop items, use existing excerpt if available
             if source_type == 'raindrop' and item.get('excerpt'):
                 content = item['excerpt']
             else:
                 content = self.extract_content_from_url(url)
             if not content:
                 return item
             # Generate summary focused on the query topic
             try:
                 prompt = f"""
-                Analyze this content and provide a detailed summary focusing on key points relevant
-                to our topic. Include specific details, data, and quotes if relevant.
                 Content: {content[:4000]}  # Limit content length for token constraints
-                Provide a concise but detailed summary in 2-3 paragraphs.
                 """
                 response = self.client.chat.completions.create(
                     model="gpt-4o-mini",
                     messages=[{"role": "user", "content": prompt}],
@@ -86,10 +119,12 @@ class RaindropSearchBot:
                 )
                 item['detailed_summary'] = response.choices[0].message.content
             except Exception as e:
                 logger.error(f"Error generating summary: {e}")
                 item['detailed_summary'] = "Summary generation failed."
             return item
         except Exception as e:

         self.newsapi = NewsApiClient(api_key=self.newsapi_key)
     def extract_content_from_url(self, url: str) -> Optional[str]:
         """Fetch *url* and return its page title plus main text.

         The page is downloaded with a browser-like User-Agent and a
         10-second timeout, parsed with BeautifulSoup, and stripped of
         script/style/nav/header/footer/iframe elements. The text of the
         first element matching a common content container selector is
         used; if none matches, the text of every ``<p>`` is joined.

         Args:
             url: Address of the page to fetch.

         Returns:
             ``"<title>\\n\\n<content>"`` with whitespace normalized
             (paragraph breaks kept), or ``None`` when the request or
             parsing fails or no text is left after cleaning.
         """
         try:
             headers = {
                 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
             }
             response = requests.get(url, headers=headers, timeout=10)
             response.raise_for_status()
             soup = BeautifulSoup(response.text, 'html.parser')

             # Drop page chrome and non-text elements before extracting text.
             for element in soup(['script', 'style', 'nav', 'header', 'footer', 'iframe']):
                 element.decompose()

             # `.string` is None for an empty <title> or one with nested
             # tags, which would put the literal text "None" into the
             # result; get_text() always yields a string.
             title = soup.title.get_text(strip=True) if soup.title else ''

             # Prefer a recognised content container; otherwise fall back
             # to every paragraph on the page.
             content_containers = soup.select('article, main, .content, .post-content, .entry-content')
             if content_containers:
                 content = content_containers[0].get_text(separator='\n', strip=True)
             else:
                 content = '\n'.join(p.get_text(strip=True) for p in soup.find_all('p'))

             full_content = f"{title}\n\n{content}"
             # Normalize horizontal whitespace first so that the newline
             # clean-up below is not undone by a blanket `\s+` collapse.
             full_content = re.sub(r'[^\S\n]+', ' ', full_content)
             full_content = re.sub(r' ?\n ?', '\n', full_content)
             # Keep at most one blank line between paragraphs.
             full_content = re.sub(r'\n{3,}', '\n\n', full_content)
             full_content = full_content.strip()
             return full_content or None
         except Exception as e:
             # Best-effort extraction: any network or parsing failure is
             # logged and reported to the caller as "no content".
             logger.error(f"Error extracting content from {url}: {e}")
             return None
             url = item.get('link') or item.get('url')
             if not url:
                 return item
             # For Raindrop items, use existing excerpt if available
             if source_type == 'raindrop' and item.get('excerpt'):
                 content = item['excerpt']
             else:
                 content = self.extract_content_from_url(url)
             if not content:
+                logger.warning(f"No content extracted from {url}")
+                item['detailed_summary'] = "Content extraction failed."
                 return item
             # Generate summary focused on the query topic
             try:
                 prompt = f"""
+                Analyze this content and provide a detailed summary focusing on key points.
                 Content: {content[:4000]}  # Limit content length for token constraints
+                Requirements:
+                1. Focus on the most important facts and findings
+                2. Include specific data points and quotes if relevant
+                3. Organize the information logically
+                4. Keep the summary to 2-3 paragraphs
+                5. Highlight any unique insights from this source
                 """
                 response = self.client.chat.completions.create(
                     model="gpt-4o-mini",
                     messages=[{"role": "user", "content": prompt}],
                 )
                 item['detailed_summary'] = response.choices[0].message.content
+                item['processed_content'] = content[:1000]  # Store truncated content for later use
             except Exception as e:
                 logger.error(f"Error generating summary: {e}")
                 item['detailed_summary'] = "Summary generation failed."
             return item
         except Exception as e: