Sentinel-AI-2.0

Sleeping

App Files Files Community

Shreyas094 committed on Oct 2, 2024

Commit

1a81bf1

verified ·

1 Parent(s): 6400d84

Update app.py

Browse files

Files changed (1) hide show

app.py +69 -70

app.py CHANGED Viewed

@@ -69,7 +69,7 @@ def is_valid_url(url):
     except ValueError:
         return False
-def scrape_with_bs4(url, session):
     try:
         response = session.get(url, timeout=5)
         response.raise_for_status()
@@ -78,20 +78,20 @@ def scrape_with_bs4(url, session):
         main_content = soup.find('main') or soup.find('article') or soup.find('div', class_='content')
         if main_content:
-            content = main_content.get_text(strip=True)
         else:
-            content = soup.get_text(strip=True)
-        return content
     except Exception as e:
         logger.error(f"Error scraping {url} with BeautifulSoup: {e}")
         return ""
-def scrape_with_trafilatura(url):
     try:
         downloaded = fetch_url(url)
-        content = extract(downloaded)
-        return content or ""
     except Exception as e:
         logger.error(f"Error scraping {url} with Trafilatura: {e}")
         return ""
@@ -371,72 +371,71 @@ def search_and_scrape(query, chat_history, num_results=5, scraper="trafilatura",
                 break
             for result in results:
-                if len(scraped_content) >= num_results:
-                    break
-                url = result.get('url', '')
-                title = result.get('title', 'No title')
-                if not is_valid_url(url):
-                    logger.warning(f"Invalid URL: {url}")
-                    continue
-                try:
-                    logger.info(f"Scraping content from: {url}")
-                    # Implement a retry mechanism with different user agents
-                    user_agents = [
-                        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
-                        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Safari/605.1.15',
-                        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
-                    ]
-                    content = ""
-                    for ua in user_agents:
-                        try:
-                            if scraper == "bs4":
-                                session.headers.update({'User-Agent': ua})
-                                content = scrape_with_bs4(url, session)
-                            else:  # trafilatura
-                                # Use urllib to handle custom headers for trafilatura
-                                req = Request(url, headers={'User-Agent': ua})
-                                with urlopen(req) as response:
-                                    downloaded = response.read()
-                                # Configure trafilatura to use a specific user agent
-                                config = use_config()
-                                config.set("DEFAULT", "USER_AGENT", ua)
-                                content = extract(downloaded, config=config)
-                            if content:
-                                break
-                        except requests.exceptions.HTTPError as e:
-                            if e.response.status_code == 403:
-                                logger.warning(f"403 Forbidden error with User-Agent: {ua}. Trying next...")
-                                continue
-                            else:
-                                raise
-                        except Exception as e:
-                            logger.error(f"Error scraping {url} with User-Agent {ua}: {str(e)}")
                             continue
-                    if not content:
-                        logger.warning(f"Failed to scrape content from {url} after trying multiple User-Agents")
                         continue
-                    # Limit content to max_chars
-                    scraped_content.append({
-                        "title": title,
-                        "url": url,
-                        "content": content[:max_chars],
-                        "scraper": scraper
-                    })
-                    logger.info(f"Successfully scraped content from {url}. Total scraped: {len(scraped_content)}")
-                except requests.exceptions.RequestException as e:
-                    logger.error(f"Error scraping {url}: {e}")
-                except Exception as e:
-                    logger.error(f"Unexpected error while scraping {url}: {e}")
             page += 1

     except ValueError:
         return False
+def scrape_with_bs4(url, session, max_chars=None):
     try:
         response = session.get(url, timeout=5)
         response.raise_for_status()
         main_content = soup.find('main') or soup.find('article') or soup.find('div', class_='content')
         if main_content:
+            content = main_content.get_text(strip=True, separator='\n')
         else:
+            content = soup.get_text(strip=True, separator='\n')
+        return content[:max_chars] if max_chars else content
     except Exception as e:
         logger.error(f"Error scraping {url} with BeautifulSoup: {e}")
         return ""
+def scrape_with_trafilatura(url, max_chars=None):
     try:
         downloaded = fetch_url(url)
+        content = extract(downloaded, include_comments=False, include_tables=True, no_fallback=False)
+        return (content or "")[:max_chars] if max_chars else (content or "")
     except Exception as e:
         logger.error(f"Error scraping {url} with Trafilatura: {e}")
         return ""
                 break
             for result in results:
+            if len(scraped_content) >= num_results:
+                break
+            url = result.get('url', '')
+            title = result.get('title', 'No title')
+            if not is_valid_url(url):
+                logger.warning(f"Invalid URL: {url}")
+                continue
+            try:
+                logger.info(f"Scraping content from: {url}")
+                # Implement a retry mechanism with different user agents
+                user_agents = [
+                    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
+                    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Safari/605.1.15',
+                    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
+                ]
+                content = ""
+                for ua in user_agents:
+                    try:
+                        if scraper == "bs4":
+                            session.headers.update({'User-Agent': ua})
+                            content = scrape_with_bs4(url, session, max_chars)
+                        else:  # trafilatura
+                            # Use urllib to handle custom headers for trafilatura
+                            req = Request(url, headers={'User-Agent': ua})
+                            with urlopen(req) as response:
+                                downloaded = response.read()
+                            # Configure trafilatura to use a specific user agent
+                            config = use_config()
+                            config.set("DEFAULT", "USER_AGENT", ua)
+                            content = scrape_with_trafilatura(url, max_chars)
+                        if content:
+                            break
+                    except requests.exceptions.HTTPError as e:
+                        if e.response.status_code == 403:
+                            logger.warning(f"403 Forbidden error with User-Agent: {ua}. Trying next...")
                             continue
+                        else:
+                            raise
+                    except Exception as e:
+                        logger.error(f"Error scraping {url} with User-Agent {ua}: {str(e)}")
                         continue
+                if not content:
+                    logger.warning(f"Failed to scrape content from {url} after trying multiple User-Agents")
+                    continue
+                scraped_content.append({
+                    "title": title,
+                    "url": url,
+                    "content": content,  # No need to slice here as it's already limited
+                    "scraper": scraper
+                })
+                logger.info(f"Successfully scraped content from {url}. Total scraped: {len(scraped_content)}")
+            except requests.exceptions.RequestException as e:
+                logger.error(f"Error scraping {url}: {e}")
+            except Exception as e:
+                logger.error(f"Unexpected error while scraping {url}: {e}")
             page += 1