import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import time
import re
from urllib.parse import urljoin, urlparse
import json
from datetime import datetime


class WebScraper:
    def __init__(self):
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                          '(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        })
        self.driver = None

    def setup_selenium(self):
        """Set up a headless Chrome WebDriver for dynamic content."""
        try:
            chrome_options = Options()
            chrome_options.add_argument("--headless")
            chrome_options.add_argument("--no-sandbox")
            chrome_options.add_argument("--disable-dev-shm-usage")
            chrome_options.add_argument("--disable-gpu")
            chrome_options.add_argument("--window-size=1920,1080")
            self.driver = webdriver.Chrome(
                service=Service(ChromeDriverManager().install()),
                options=chrome_options
            )
            return True
        except Exception as e:
            print(f"Failed to set up Selenium: {e}")
            return False

    def close_selenium(self):
        """Close the Selenium WebDriver if it is running."""
        if self.driver:
            self.driver.quit()
            self.driver = None

    def get_page_content(self, url, use_selenium=False):
        """Fetch page HTML with requests, or with Selenium for JS-rendered pages."""
        try:
            if use_selenium and self.driver:
                self.driver.get(url)
                time.sleep(2)  # Give dynamic content time to render
                return self.driver.page_source
            response = self.session.get(url, timeout=10)
            response.raise_for_status()
            return response.text
        except Exception as e:
            print(f"Error fetching page: {e}")
            return None

    def extract_text_content(self, soup):
        """Extract title, headings, paragraphs, and lists from parsed HTML."""
        text_data = {
            "title": "",
            "headings": [],
            "paragraphs": [],
            "lists": []
        }

        # Extract title
        title_tag = soup.find('title')
        if title_tag:
            text_data["title"] = title_tag.get_text().strip()

        # Extract headings
        for tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
            for heading in soup.find_all(tag):
                text = heading.get_text().strip()
                if text:
                    text_data["headings"].append({"level": tag, "text": text})

        # Extract paragraphs, filtering out very short fragments
        for p in soup.find_all('p'):
            text = p.get_text().strip()
            if text and len(text) > 20:
                text_data["paragraphs"].append(text)

        # Extract lists
        for lst in soup.find_all(['ul', 'ol']):
            items = [item.get_text().strip()
                     for item in lst.find_all('li')
                     if item.get_text().strip()]
            if items:
                text_data["lists"].append({"type": lst.name, "items": items})

        return text_data

    def extract_numbers(self, soup):
        """Extract all numbers (integers and floats) from the page text."""
        text = soup.get_text()
        # Match integers and decimals such as "42" or "3.14"
        numbers = re.findall(r'\b\d+(?:\.\d+)?\b', text)
        # Convert to float for consistency and deduplicate
        return sorted({float(n) for n in numbers})

    def extract_images(self, soup, base_url):
        """Extract image URLs and attributes from parsed HTML."""
        images = []
        for img in soup.find_all('img'):
            src = img.get('src', '')
            if not src:
                continue
            # Resolve relative URLs against the page URL
            if not src.startswith(('http://', 'https://')):
                src = urljoin(base_url, src)
            images.append({
                "src": src,
                "alt": img.get('alt', ''),
                "title": img.get('title', ''),
                "width": img.get('width', ''),
                "height": img.get('height', '')
            })
        return images

    def extract_links(self, soup, base_url):
        """Extract hyperlinks from parsed HTML, skipping same-page anchors."""
        links = []
        base_netloc = urlparse(base_url).netloc
        for link in soup.find_all('a', href=True):
            href = link.get('href')
            text = link.get_text().strip()
            if not (href and text):
                continue
            # Skip same-page anchors before resolving relative URLs,
            # since urljoin would turn them into absolute URLs
            if href.startswith('#'):
                continue
            if not href.startswith(('http://', 'https://')):
                href = urljoin(base_url, href)
            links.append({
                "href": href,
                "text": text,
                "title": link.get('title', ''),
                # Compare hostnames rather than full URL prefixes
                "is_external": urlparse(href).netloc != base_netloc
            })
        return links

    def extract_tables(self, soup):
        """Extract tables (caption, headers, rows) from parsed HTML."""
        tables = []
        for table in soup.find_all('table'):
            table_data = {"headers": [], "rows": [], "caption": ""}

            # Extract caption
            caption = table.find('caption')
            if caption:
                table_data["caption"] = caption.get_text().strip()

            # Extract headers
            thead = table.find('thead')
            if thead:
                header_row = thead.find('tr')
                if header_row:
                    headers = header_row.find_all(['th', 'td'])
                    table_data["headers"] = [h.get_text().strip() for h in headers]

            # Extract rows; fall back to the table itself if there is no tbody
            tbody = table.find('tbody') or table
            for row in tbody.find_all('tr'):
                cells = row.find_all(['td', 'th'])
                if cells:
                    table_data["rows"].append(
                        [cell.get_text().strip() for cell in cells])

            if table_data["rows"]:
                tables.append(table_data)
        return tables

    def extract_metadata(self, soup):
        """Extract document metadata from <title>, <meta>, and <html> tags."""
        metadata = {
            "title": "",
            "description": "",
            "keywords": [],
            "author": "",
            "language": "en",
            "robots": "",
            "viewport": "",
            "charset": ""
        }

        # Extract title
        title_tag = soup.find('title')
        if title_tag:
            metadata["title"] = title_tag.get_text().strip()

        # Extract meta tags
        for meta in soup.find_all('meta'):
            name = meta.get('name', '').lower()
            content = meta.get('content', '')
            property_attr = meta.get('property', '').lower()

            if name == 'description' or property_attr == 'og:description':
                metadata["description"] = content
            elif name == 'keywords':
                metadata["keywords"] = [kw.strip() for kw in content.split(',')]
            elif name == 'author':
                metadata["author"] = content
            elif name == 'robots':
                metadata["robots"] = content
            elif name == 'viewport':
                metadata["viewport"] = content
            elif property_attr == 'og:title':
                metadata["title"] = content or metadata["title"]

        # Extract charset
        charset_meta = soup.find('meta', charset=True)
        if charset_meta:
            metadata["charset"] = charset_meta.get('charset')

        # Extract language
        html_tag = soup.find('html')
        if html_tag:
            metadata["language"] = html_tag.get('lang', 'en')

        return metadata

    def scrape_website(self, url, data_types, max_pages=1, rate_limit=2):
        """Scrape a single page and extract the requested data types.

        Note: max_pages is accepted for API compatibility but is currently
        unused; only the given URL is fetched.
        """
        scraped_data = {
            "url": url,
            "timestamp": datetime.now().isoformat(),
            "data_types": data_types,
            "pages_crawled": 0,
            "errors": []
        }
        use_selenium = False
        try:
            # Use Selenium when the requested data likely needs JS rendering
            use_selenium = "images" in data_types or "tables" in data_types
            if use_selenium and not self.setup_selenium():
                scraped_data["errors"].append(
                    "Failed to set up Selenium for dynamic content")

            # Get page content
            content = self.get_page_content(url, use_selenium)
            if not content:
                scraped_data["errors"].append("Failed to fetch page content")
                return scraped_data

            # Parse with BeautifulSoup
            soup = BeautifulSoup(content, 'html.parser')
            scraped_data["pages_crawled"] = 1

            # Extract data based on the selected types
            if "text" in data_types:
                scraped_data["text_content"] = self.extract_text_content(soup)
            if "images" in data_types:
                scraped_data["images"] = self.extract_images(soup, url)
            if "links" in data_types:
                scraped_data["links"] = self.extract_links(soup, url)
            if "tables" in data_types:
                scraped_data["tables"] = self.extract_tables(soup)
            if "metadata" in data_types:
                scraped_data["metadata"] = self.extract_metadata(soup)
            if "numbers" in data_types:
                scraped_data["numbers"] = self.extract_numbers(soup)

            # Rate limiting between requests
            time.sleep(rate_limit)
        except Exception as e:
            scraped_data["errors"].append(f"Scraping error: {str(e)}")
        finally:
            # Clean up Selenium
            if use_selenium:
                self.close_selenium()

        return scraped_data


# Global scraper instance
scraper = WebScraper()