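"""Web scraping utilities.

Provides a WebScraper class that fetches pages with requests (or Selenium
for dynamic content) and extracts text, images, links, tables, metadata,
and numbers via BeautifulSoup.
"""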
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import time
import re
from urllib.parse import urljoin, urlparse
from datetime import datetime
class WebScraper:
    def __init__(self):
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        })
        self.driver = None
    def setup_selenium(self):
        """Set up a headless Selenium WebDriver for dynamic content."""
        try:
            chrome_options = Options()
            chrome_options.add_argument("--headless")
            chrome_options.add_argument("--no-sandbox")
            chrome_options.add_argument("--disable-dev-shm-usage")
            chrome_options.add_argument("--disable-gpu")
            chrome_options.add_argument("--window-size=1920,1080")
            self.driver = webdriver.Chrome(
                service=Service(ChromeDriverManager().install()),
                options=chrome_options
            )
            return True
        except Exception as e:
            print(f"Failed to set up Selenium: {e}")
            return False
    def close_selenium(self):
        """Close the Selenium WebDriver if one is open."""
        if self.driver:
            self.driver.quit()
            self.driver = None
    def get_page_content(self, url, use_selenium=False):
        """Get page content using requests or Selenium."""
        try:
            if use_selenium and self.driver:
                self.driver.get(url)
                time.sleep(2)  # Wait for dynamic content to render
                return self.driver.page_source
            else:
                response = self.session.get(url, timeout=10)
                response.raise_for_status()
                return response.text
        except Exception as e:
            print(f"Error fetching page: {e}")
            return None
    def extract_text_content(self, soup):
        """Extract text content from a BeautifulSoup object."""
        text_data = {
            "title": "",
            "headings": [],
            "paragraphs": [],
            "lists": []
        }
        # Extract title
        title_tag = soup.find('title')
        if title_tag:
            text_data["title"] = title_tag.get_text().strip()
        # Extract headings
        for tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
            for heading in soup.find_all(tag):
                text = heading.get_text().strip()
                if text:
                    text_data["headings"].append({
                        "level": tag,
                        "text": text
                    })
        # Extract paragraphs
        for p in soup.find_all('p'):
            text = p.get_text().strip()
            if text and len(text) > 20:  # Filter out very short text
                text_data["paragraphs"].append(text)
        # Extract lists
        for lst in soup.find_all(['ul', 'ol']):
            items = []
            for item in lst.find_all('li'):
                text = item.get_text().strip()
                if text:
                    items.append(text)
            if items:
                text_data["lists"].append({
                    "type": lst.name,
                    "items": items
                })
        return text_data
    def extract_numbers(self, soup):
        """Extract all numbers (integers and floats) from the text content."""
        text = soup.get_text()
        # Regex to find integers and floats (e.g. "42" or "3.14")
        numbers = re.findall(r'\b\d+\.?\d*\b', text)
        # Convert to float for consistency, deduplicate, and sort
        return sorted({float(n) for n in numbers})
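    # Example of the extraction above: for page text "Price: 3.50 for 7 items",
    # re.findall yields ['3.50', '7'], which becomes [3.5, 7.0] after
    # conversion, deduplication, and sorting.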
    def extract_images(self, soup, base_url):
        """Extract images from a BeautifulSoup object."""
        images = []
        for img in soup.find_all('img'):
            src = img.get('src', '')
            if src:
                # Make relative URLs absolute
                if not src.startswith(('http://', 'https://')):
                    src = urljoin(base_url, src)
                images.append({
                    "src": src,
                    "alt": img.get('alt', ''),
                    "title": img.get('title', ''),
                    "width": img.get('width', ''),
                    "height": img.get('height', '')
                })
        return images
    def extract_links(self, soup, base_url):
        """Extract links from a BeautifulSoup object."""
        links = []
        base_netloc = urlparse(base_url).netloc
        for link in soup.find_all('a', href=True):
            href = link.get('href')
            text = link.get_text().strip()
            if not href or not text:
                continue
            # Skip pure fragment anchors before resolving: urljoin would
            # turn "#section" into an absolute URL that no longer starts
            # with "#", so the check must happen first
            if href.startswith('#'):
                continue
            # Make relative URLs absolute
            if not href.startswith(('http://', 'https://')):
                href = urljoin(base_url, href)
            links.append({
                "href": href,
                "text": text,
                "title": link.get('title', ''),
                # Compare hostnames rather than string prefixes, so
                # http/https variants of the same site count as internal
                "is_external": urlparse(href).netloc != base_netloc
            })
        return links
    def extract_tables(self, soup):
        """Extract tables from a BeautifulSoup object."""
        tables = []
        for table in soup.find_all('table'):
            table_data = {
                "headers": [],
                "rows": [],
                "caption": ""
            }
            # Extract caption
            caption = table.find('caption')
            if caption:
                table_data["caption"] = caption.get_text().strip()
            # Extract headers
            thead = table.find('thead')
            if thead:
                header_row = thead.find('tr')
                if header_row:
                    headers = header_row.find_all(['th', 'td'])
                    table_data["headers"] = [h.get_text().strip() for h in headers]
            # Extract rows; skip rows inside <thead> so the header row is
            # not duplicated when the table lacks a <tbody>
            tbody = table.find('tbody') or table
            for row in tbody.find_all('tr'):
                if row.find_parent('thead'):
                    continue
                cells = row.find_all(['td', 'th'])
                if cells:
                    table_data["rows"].append([cell.get_text().strip() for cell in cells])
            if table_data["rows"]:
                tables.append(table_data)
        return tables
    def extract_metadata(self, soup):
        """Extract metadata from a BeautifulSoup object."""
        metadata = {
            "title": "",
            "description": "",
            "keywords": [],
            "author": "",
            "language": "en",
            "robots": "",
            "viewport": "",
            "charset": ""
        }
        # Extract title
        title_tag = soup.find('title')
        if title_tag:
            metadata["title"] = title_tag.get_text().strip()
        # Extract meta tags
        for meta in soup.find_all('meta'):
            name = meta.get('name', '').lower()
            content = meta.get('content', '')
            property_attr = meta.get('property', '').lower()
            if name == 'description' or property_attr == 'og:description':
                metadata["description"] = content
            elif name == 'keywords':
                metadata["keywords"] = [kw.strip() for kw in content.split(',')]
            elif name == 'author':
                metadata["author"] = content
            elif name == 'robots':
                metadata["robots"] = content
            elif name == 'viewport':
                metadata["viewport"] = content
            elif property_attr == 'og:title':
                metadata["title"] = content or metadata["title"]
        # Extract charset
        charset_meta = soup.find('meta', charset=True)
        if charset_meta:
            metadata["charset"] = charset_meta.get('charset')
        # Extract language from the <html> lang attribute
        html_tag = soup.find('html')
        if html_tag:
            metadata["language"] = html_tag.get('lang', 'en')
        return metadata
    def scrape_website(self, url, data_types, max_pages=1, rate_limit=2):
        """Main scraping function."""
        scraped_data = {
            "url": url,
            "timestamp": datetime.now().isoformat(),
            "data_types": data_types,
            "pages_crawled": 0,
            "errors": []
        }
        # Decide up front whether Selenium is needed, so the finally block
        # can reference the flag even if an error occurs early in the try
        use_selenium = "images" in data_types or "tables" in data_types
        try:
            if use_selenium and not self.setup_selenium():
                scraped_data["errors"].append("Failed to setup Selenium for dynamic content")
            # Get page content
            content = self.get_page_content(url, use_selenium)
            if not content:
                scraped_data["errors"].append("Failed to fetch page content")
                return scraped_data
            # Parse with BeautifulSoup
            soup = BeautifulSoup(content, 'html.parser')
            scraped_data["pages_crawled"] = 1
            # Extract data based on selected types
            if "text" in data_types:
                scraped_data["text_content"] = self.extract_text_content(soup)
            if "images" in data_types:
                scraped_data["images"] = self.extract_images(soup, url)
            if "links" in data_types:
                scraped_data["links"] = self.extract_links(soup, url)
            if "tables" in data_types:
                scraped_data["tables"] = self.extract_tables(soup)
            if "metadata" in data_types:
                scraped_data["metadata"] = self.extract_metadata(soup)
            if "numbers" in data_types:
                scraped_data["numbers"] = self.extract_numbers(soup)
            # Rate limiting
            time.sleep(rate_limit)
        except Exception as e:
            scraped_data["errors"].append(f"Scraping error: {str(e)}")
        finally:
            # Clean up Selenium
            if use_selenium:
                self.close_selenium()
        return scraped_data
# Global scraper instance
scraper = WebScraper()
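# Minimal usage sketch (assumed entry point; the example URL is a
# placeholder, not part of the original module):
if __name__ == "__main__":
    result = scraper.scrape_website(
        "https://example.com",  # hypothetical target URL
        data_types=["text", "links", "metadata"],
        max_pages=1,
        rate_limit=2,
    )
    print(f"Crawled {result['pages_crawled']} page(s) with "
          f"{len(result['errors'])} error(s)")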