# scraper.py — web scraping utilities (requests/BeautifulSoup, optional Selenium).
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
import time
import re
from urllib.parse import urljoin, urlparse
import json
from datetime import datetime
class WebScraper:
def __init__(self):
self.session = requests.Session()
self.session.headers.update({
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
})
self.driver = None
def setup_selenium(self):
"""Setup Selenium WebDriver for dynamic content"""
try:
chrome_options = Options()
chrome_options.add_argument("--headless")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--window-size=1920,1080")
self.driver = webdriver.Chrome(
service=webdriver.chrome.service.Service(ChromeDriverManager().install()),
options=chrome_options
)
return True
except Exception as e:
print(f"Failed to setup Selenium: {e}")
return False
def close_selenium(self):
"""Close Selenium WebDriver"""
if self.driver:
self.driver.quit()
self.driver = None
def get_page_content(self, url, use_selenium=False):
"""Get page content using requests or Selenium"""
try:
if use_selenium and self.driver:
self.driver.get(url)
time.sleep(2) # Wait for dynamic content
return self.driver.page_source
else:
response = self.session.get(url, timeout=10)
response.raise_for_status()
return response.text
except Exception as e:
print(f"Error fetching page: {e}")
return None
def extract_text_content(self, soup):
"""Extract text content from BeautifulSoup object"""
text_data = {
"title": "",
"headings": [],
"paragraphs": [],
"lists": []
}
# Extract title
title_tag = soup.find('title')
if title_tag:
text_data["title"] = title_tag.get_text().strip()
# Extract headings
for tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
headings = soup.find_all(tag)
for heading in headings:
text = heading.get_text().strip()
if text:
text_data["headings"].append({
"level": tag,
"text": text
})
# Extract paragraphs
paragraphs = soup.find_all('p')
for p in paragraphs:
text = p.get_text().strip()
if text and len(text) > 20: # Filter out short text
text_data["paragraphs"].append(text)
# Extract lists
lists = soup.find_all(['ul', 'ol'])
for lst in lists:
items = []
for item in lst.find_all('li'):
text = item.get_text().strip()
if text:
items.append(text)
if items:
text_data["lists"].append({
"type": lst.name,
"items": items
})
return text_data
def extract_numbers(self, soup):
"""Extract all numbers (integers and floats) from the text content"""
text = soup.get_text()
# Regex to find integers and floats
numbers = re.findall(r'\b\d+\.?\d*\b', text)
# Convert to float for consistency, and remove duplicates
return sorted(list(set([float(n) for n in numbers if n.strip()])))
def extract_images(self, soup, base_url):
"""Extract images from BeautifulSoup object"""
images = []
img_tags = soup.find_all('img')
for img in img_tags:
src = img.get('src', '')
alt = img.get('alt', '')
title = img.get('title', '')
if src:
# Make relative URLs absolute
if not src.startswith(('http://', 'https://')):
src = urljoin(base_url, src)
images.append({
"src": src,
"alt": alt,
"title": title,
"width": img.get('width', ''),
"height": img.get('height', '')
})
return images
def extract_links(self, soup, base_url):
"""Extract links from BeautifulSoup object"""
links = []
link_tags = soup.find_all('a', href=True)
for link in link_tags:
href = link.get('href')
text = link.get_text().strip()
if href and text:
# Make relative URLs absolute
if not href.startswith(('http://', 'https://')):
href = urljoin(base_url, href)
# Only include external and internal links, skip anchors
if not href.startswith('#'):
links.append({
"href": href,
"text": text,
"title": link.get('title', ''),
"is_external": not href.startswith(base_url)
})
return links
def extract_tables(self, soup):
"""Extract tables from BeautifulSoup object"""
tables = []
table_tags = soup.find_all('table')
for table in table_tags:
table_data = {
"headers": [],
"rows": [],
"caption": ""
}
# Extract caption
caption = table.find('caption')
if caption:
table_data["caption"] = caption.get_text().strip()
# Extract headers
thead = table.find('thead')
if thead:
header_row = thead.find('tr')
if header_row:
headers = header_row.find_all(['th', 'td'])
table_data["headers"] = [h.get_text().strip() for h in headers]
# Extract rows
tbody = table.find('tbody') or table
rows = tbody.find_all('tr')
for row in rows:
cells = row.find_all(['td', 'th'])
if cells:
row_data = [cell.get_text().strip() for cell in cells]
table_data["rows"].append(row_data)
if table_data["rows"]:
tables.append(table_data)
return tables
def extract_metadata(self, soup):
"""Extract metadata from BeautifulSoup object"""
metadata = {
"title": "",
"description": "",
"keywords": [],
"author": "",
"language": "en",
"robots": "",
"viewport": "",
"charset": ""
}
# Extract title
title_tag = soup.find('title')
if title_tag:
metadata["title"] = title_tag.get_text().strip()
# Extract meta tags
meta_tags = soup.find_all('meta')
for meta in meta_tags:
name = meta.get('name', '').lower()
content = meta.get('content', '')
property_attr = meta.get('property', '').lower()
if name == 'description' or property_attr == 'og:description':
metadata["description"] = content
elif name == 'keywords':
metadata["keywords"] = [kw.strip() for kw in content.split(',')]
elif name == 'author':
metadata["author"] = content
elif name == 'robots':
metadata["robots"] = content
elif name == 'viewport':
metadata["viewport"] = content
elif property_attr == 'og:title':
metadata["title"] = content or metadata["title"]
# Extract charset
charset_meta = soup.find('meta', charset=True)
if charset_meta:
metadata["charset"] = charset_meta.get('charset')
# Extract language
html_tag = soup.find('html')
if html_tag:
lang = html_tag.get('lang', 'en')
metadata["language"] = lang
return metadata
def scrape_website(self, url, data_types, max_pages=1, rate_limit=2):
"""Main scraping function"""
scraped_data = {
"url": url,
"timestamp": datetime.now().isoformat(),
"data_types": data_types,
"pages_crawled": 0,
"errors": []
}
try:
# Setup Selenium if needed for dynamic content
use_selenium = "images" in data_types or "tables" in data_types
if use_selenium:
if not self.setup_selenium():
scraped_data["errors"].append("Failed to setup Selenium for dynamic content")
# Get page content
content = self.get_page_content(url, use_selenium)
if not content:
scraped_data["errors"].append("Failed to fetch page content")
return scraped_data
# Parse with BeautifulSoup
soup = BeautifulSoup(content, 'html.parser')
scraped_data["pages_crawled"] = 1
# Extract data based on selected types
if "text" in data_types:
scraped_data["text_content"] = self.extract_text_content(soup)
if "images" in data_types:
scraped_data["images"] = self.extract_images(soup, url)
if "links" in data_types:
scraped_data["links"] = self.extract_links(soup, url)
if "tables" in data_types:
scraped_data["tables"] = self.extract_tables(soup)
if "metadata" in data_types:
scraped_data["metadata"] = self.extract_metadata(soup)
if "numbers" in data_types:
scraped_data["numbers"] = self.extract_numbers(soup)
# Rate limiting
time.sleep(rate_limit)
except Exception as e:
scraped_data["errors"].append(f"Scraping error: {str(e)}")
finally:
# Clean up Selenium
if use_selenium:
self.close_selenium()
return scraped_data
# Module-level singleton scraper shared by importers of this module.
scraper = WebScraper()