import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
import time
import re
from urllib.parse import urljoin, urlparse
import json
from datetime import datetime


class WebScraper:
    def __init__(self):
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        })
        self.driver = None

    def setup_selenium(self):
        """Setup Selenium WebDriver for dynamic content"""
        try:
            chrome_options = Options()
            chrome_options.add_argument("--headless")
            chrome_options.add_argument("--no-sandbox")
            chrome_options.add_argument("--disable-dev-shm-usage")
            chrome_options.add_argument("--disable-gpu")
            chrome_options.add_argument("--window-size=1920,1080")
            self.driver = webdriver.Chrome(
                service=Service(ChromeDriverManager().install()),
                options=chrome_options
            )
            return True
        except Exception as e:
            print(f"Failed to setup Selenium: {e}")
            return False

    def close_selenium(self):
        """Close Selenium WebDriver"""
        if self.driver:
            self.driver.quit()
            self.driver = None

    def get_page_content(self, url, use_selenium=False):
        """Get page content using requests or Selenium"""
        try:
            if use_selenium and self.driver:
                self.driver.get(url)
                time.sleep(2)  # Wait for dynamic content to render
                return self.driver.page_source
            else:
                response = self.session.get(url, timeout=10)
                response.raise_for_status()
                return response.text
        except Exception as e:
            print(f"Error fetching page: {e}")
            return None

    def extract_text_content(self, soup):
        """Extract text content from BeautifulSoup object"""
        text_data = {
            "title": "",
            "headings": [],
            "paragraphs": [],
            "lists": []
        }

        # Extract title
        title_tag = soup.find('title')
        if title_tag:
            text_data["title"] = title_tag.get_text().strip()

        # Extract headings
        for tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
            headings = soup.find_all(tag)
            for heading in headings:
                text = heading.get_text().strip()
                if text:
                    text_data["headings"].append({
                        "level": tag,
                        "text": text
                    })

        # Extract paragraphs
        paragraphs = soup.find_all('p')
        for p in paragraphs:
            text = p.get_text().strip()
            if text and len(text) > 20:  # Filter out short text fragments
                text_data["paragraphs"].append(text)

        # Extract lists
        lists = soup.find_all(['ul', 'ol'])
        for lst in lists:
            items = []
            for item in lst.find_all('li'):
                text = item.get_text().strip()
                if text:
                    items.append(text)
            if items:
                text_data["lists"].append({
                    "type": lst.name,
                    "items": items
                })

        return text_data

    def extract_numbers(self, soup):
        """Extract all numbers (integers and floats) from the text content"""
        text = soup.get_text()
        # Regex to find integers and floats
        numbers = re.findall(r'\b\d+\.?\d*\b', text)
        # Convert to float for consistency and remove duplicates
        return sorted(set(float(n) for n in numbers if n.strip()))

    def extract_images(self, soup, base_url):
        """Extract images from BeautifulSoup object"""
        images = []
        img_tags = soup.find_all('img')
        for img in img_tags:
            src = img.get('src', '')
            alt = img.get('alt', '')
            title = img.get('title', '')
            if src:
                # Make relative URLs absolute
                if not src.startswith(('http://', 'https://')):
                    src = urljoin(base_url, src)
                images.append({
                    "src": src,
                    "alt": alt,
                    "title": title,
                    "width": img.get('width', ''),
                    "height": img.get('height', '')
                })
        return images

    def extract_links(self, soup, base_url):
        """Extract links from BeautifulSoup object"""
        links = []
        link_tags = soup.find_all('a', href=True)
        base_domain = urlparse(base_url).netloc
        for link in link_tags:
            href = link.get('href')
            text = link.get_text().strip()
            if href and text:
                # Skip pure fragment anchors before resolving relative URLs
                if href.startswith('#'):
                    continue
                # Make relative URLs absolute
                if not href.startswith(('http://', 'https://')):
                    href = urljoin(base_url, href)
                links.append({
                    "href": href,
                    "text": text,
                    "title": link.get('title', ''),
                    # Compare domains so only links to other hosts count as external
                    "is_external": urlparse(href).netloc != base_domain
                })
        return links

    def extract_tables(self, soup):
        """Extract tables from BeautifulSoup object"""
        tables = []
        table_tags = soup.find_all('table')
        for table in table_tags:
            table_data = {
                "headers": [],
                "rows": [],
                "caption": ""
            }

            # Extract caption
            caption = table.find('caption')
            if caption:
                table_data["caption"] = caption.get_text().strip()

            # Extract headers
            thead = table.find('thead')
            if thead:
                header_row = thead.find('tr')
                if header_row:
                    headers = header_row.find_all(['th', 'td'])
                    table_data["headers"] = [h.get_text().strip() for h in headers]

            # Extract rows (fall back to the table itself when there is no tbody)
            tbody = table.find('tbody') or table
            rows = tbody.find_all('tr')
            for row in rows:
                # Skip header rows so they are not duplicated as data rows
                if row.find_parent('thead'):
                    continue
                cells = row.find_all(['td', 'th'])
                if cells:
                    row_data = [cell.get_text().strip() for cell in cells]
                    table_data["rows"].append(row_data)

            if table_data["rows"]:
                tables.append(table_data)
        return tables

    def extract_metadata(self, soup):
        """Extract metadata from BeautifulSoup object"""
        metadata = {
            "title": "",
            "description": "",
            "keywords": [],
            "author": "",
            "language": "en",
            "robots": "",
            "viewport": "",
            "charset": ""
        }

        # Extract title
        title_tag = soup.find('title')
        if title_tag:
            metadata["title"] = title_tag.get_text().strip()

        # Extract meta tags
        meta_tags = soup.find_all('meta')
        for meta in meta_tags:
            name = meta.get('name', '').lower()
            content = meta.get('content', '')
            property_attr = meta.get('property', '').lower()
            if name == 'description' or property_attr == 'og:description':
                metadata["description"] = content
            elif name == 'keywords':
                metadata["keywords"] = [kw.strip() for kw in content.split(',')]
            elif name == 'author':
                metadata["author"] = content
            elif name == 'robots':
                metadata["robots"] = content
            elif name == 'viewport':
                metadata["viewport"] = content
            elif property_attr == 'og:title':
                metadata["title"] = content or metadata["title"]

        # Extract charset
        charset_meta = soup.find('meta', charset=True)
        if charset_meta:
            metadata["charset"] = charset_meta.get('charset')

        # Extract language
        html_tag = soup.find('html')
        if html_tag:
            metadata["language"] = html_tag.get('lang', 'en')

        return metadata

    def scrape_website(self, url, data_types, max_pages=1, rate_limit=2):
        """Main scraping function"""
        scraped_data = {
            "url": url,
            "timestamp": datetime.now().isoformat(),
            "data_types": data_types,
            "pages_crawled": 0,
            "errors": []
        }
        # Initialise before the try block so the finally clause can always reference it
        use_selenium = False
        try:
            # Setup Selenium if needed for dynamic content
            use_selenium = "images" in data_types or "tables" in data_types
            if use_selenium:
                if not self.setup_selenium():
                    scraped_data["errors"].append("Failed to setup Selenium for dynamic content")

            # Get page content
            content = self.get_page_content(url, use_selenium)
            if not content:
                scraped_data["errors"].append("Failed to fetch page content")
                return scraped_data

            # Parse with BeautifulSoup
            soup = BeautifulSoup(content, 'html.parser')
            scraped_data["pages_crawled"] = 1

            # Extract data based on selected types
            if "text" in data_types:
                scraped_data["text_content"] = self.extract_text_content(soup)
            if "images" in data_types:
                scraped_data["images"] = self.extract_images(soup, url)
            if "links" in data_types:
                scraped_data["links"] = self.extract_links(soup, url)
            if "tables" in data_types:
                scraped_data["tables"] = self.extract_tables(soup)
            if "metadata" in data_types:
                scraped_data["metadata"] = self.extract_metadata(soup)
            if "numbers" in data_types:
                scraped_data["numbers"] = self.extract_numbers(soup)

            # Rate limiting between requests
            time.sleep(rate_limit)
        except Exception as e:
            scraped_data["errors"].append(f"Scraping error: {str(e)}")
        finally:
            # Clean up Selenium
            if use_selenium:
                self.close_selenium()
        return scraped_data


# Global scraper instance
scraper = WebScraper()
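

# Illustrative usage sketch (not part of the original module): one way the
# global scraper instance might be driven from a script. The URL and the
# data_types list below are placeholder values chosen for the example.
if __name__ == "__main__":
    result = scraper.scrape_website(
        "https://example.com",
        data_types=["text", "links", "metadata"],
        rate_limit=1
    )
    # Pretty-print the collected data as JSON for quick inspection
    print(json.dumps(result, indent=2, ensure_ascii=False))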