import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
import time
import re
from urllib.parse import urljoin, urlparse
import json
from datetime import datetime


class WebScraper:
    def __init__(self):
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        })
        self.driver = None

    def setup_selenium(self):
        """Setup Selenium WebDriver for dynamic content"""
        try:
            chrome_options = Options()
            chrome_options.add_argument("--headless")
            chrome_options.add_argument("--no-sandbox")
            chrome_options.add_argument("--disable-dev-shm-usage")
            chrome_options.add_argument("--disable-gpu")
            chrome_options.add_argument("--window-size=1920,1080")
            self.driver = webdriver.Chrome(
                service=Service(ChromeDriverManager().install()),
                options=chrome_options
            )
            return True
        except Exception as e:
            print(f"Failed to setup Selenium: {e}")
            return False

    def close_selenium(self):
        """Close Selenium WebDriver"""
        if self.driver:
            self.driver.quit()
            self.driver = None

    def get_page_content(self, url, use_selenium=False):
        """Get page content using requests or Selenium"""
        try:
            if use_selenium and self.driver:
                self.driver.get(url)
                time.sleep(2)  # Wait for dynamic content to render
                return self.driver.page_source
            else:
                response = self.session.get(url, timeout=10)
                response.raise_for_status()
                return response.text
        except Exception as e:
            print(f"Error fetching page: {e}")
            return None

    def extract_text_content(self, soup):
        """Extract text content from BeautifulSoup object"""
        text_data = {
            "title": "",
            "headings": [],
            "paragraphs": [],
            "lists": []
        }

        # Extract title
        title_tag = soup.find('title')
        if title_tag:
            text_data["title"] = title_tag.get_text().strip()

        # Extract headings
        for tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
            headings = soup.find_all(tag)
            for heading in headings:
                text = heading.get_text().strip()
                if text:
                    text_data["headings"].append({
                        "level": tag,
                        "text": text
                    })

        # Extract paragraphs
        paragraphs = soup.find_all('p')
        for p in paragraphs:
            text = p.get_text().strip()
            if text and len(text) > 20:  # Filter out short text fragments
                text_data["paragraphs"].append(text)

        # Extract lists
        lists = soup.find_all(['ul', 'ol'])
        for lst in lists:
            items = []
            for item in lst.find_all('li'):
                text = item.get_text().strip()
                if text:
                    items.append(text)
            if items:
                text_data["lists"].append({
                    "type": lst.name,
                    "items": items
                })

        return text_data

    def extract_numbers(self, soup):
        """Extract all numbers (integers and floats) from the text content"""
        text = soup.get_text()
        # Regex to find integers and floats
        numbers = re.findall(r'\b\d+\.?\d*\b', text)
        # Convert to float for consistency and remove duplicates
        return sorted(set(float(n) for n in numbers if n.strip()))

    def extract_images(self, soup, base_url):
        """Extract images from BeautifulSoup object"""
        images = []
        img_tags = soup.find_all('img')
        for img in img_tags:
            src = img.get('src', '')
            alt = img.get('alt', '')
            title = img.get('title', '')
            if src:
                # Make relative URLs absolute
                if not src.startswith(('http://', 'https://')):
                    src = urljoin(base_url, src)
                images.append({
                    "src": src,
                    "alt": alt,
                    "title": title,
                    "width": img.get('width', ''),
                    "height": img.get('height', '')
                })
        return images

    def extract_links(self, soup, base_url):
        """Extract links from BeautifulSoup object"""
        links = []
        link_tags = soup.find_all('a', href=True)
        base_domain = urlparse(base_url).netloc
        for link in link_tags:
            href = link.get('href')
            text = link.get_text().strip()
            if href and text:
                # Skip pure fragment anchors before resolving relative URLs
                if href.startswith('#'):
                    continue
                # Make relative URLs absolute
                if not href.startswith(('http://', 'https://')):
                    href = urljoin(base_url, href)
                links.append({
                    "href": href,
                    "text": text,
                    "title": link.get('title', ''),
                    # Compare domains so only links to other hosts count as external
                    "is_external": urlparse(href).netloc != base_domain
                })
        return links

    def extract_tables(self, soup):
        """Extract tables from BeautifulSoup object"""
        tables = []
        table_tags = soup.find_all('table')
        for table in table_tags:
            table_data = {
                "headers": [],
                "rows": [],
                "caption": ""
            }

            # Extract caption
            caption = table.find('caption')
            if caption:
                table_data["caption"] = caption.get_text().strip()

            # Extract headers
            thead = table.find('thead')
            if thead:
                header_row = thead.find('tr')
                if header_row:
                    headers = header_row.find_all(['th', 'td'])
                    table_data["headers"] = [h.get_text().strip() for h in headers]

            # Extract rows (fall back to the table itself when there is no tbody)
            tbody = table.find('tbody') or table
            rows = tbody.find_all('tr')
            for row in rows:
                # Skip header rows so they are not duplicated as data rows
                if row.find_parent('thead'):
                    continue
                cells = row.find_all(['td', 'th'])
                if cells:
                    row_data = [cell.get_text().strip() for cell in cells]
                    table_data["rows"].append(row_data)

            if table_data["rows"]:
                tables.append(table_data)
        return tables

    def extract_metadata(self, soup):
        """Extract metadata from BeautifulSoup object"""
        metadata = {
            "title": "",
            "description": "",
            "keywords": [],
            "author": "",
            "language": "en",
            "robots": "",
            "viewport": "",
            "charset": ""
        }

        # Extract title
        title_tag = soup.find('title')
        if title_tag:
            metadata["title"] = title_tag.get_text().strip()

        # Extract meta tags
        meta_tags = soup.find_all('meta')
        for meta in meta_tags:
            name = meta.get('name', '').lower()
            content = meta.get('content', '')
            property_attr = meta.get('property', '').lower()
            if name == 'description' or property_attr == 'og:description':
                metadata["description"] = content
            elif name == 'keywords':
                metadata["keywords"] = [kw.strip() for kw in content.split(',')]
            elif name == 'author':
                metadata["author"] = content
            elif name == 'robots':
                metadata["robots"] = content
            elif name == 'viewport':
                metadata["viewport"] = content
            elif property_attr == 'og:title':
                metadata["title"] = content or metadata["title"]

        # Extract charset
        charset_meta = soup.find('meta', charset=True)
        if charset_meta:
            metadata["charset"] = charset_meta.get('charset')

        # Extract language
        html_tag = soup.find('html')
        if html_tag:
            metadata["language"] = html_tag.get('lang', 'en')

        return metadata

    def scrape_website(self, url, data_types, max_pages=1, rate_limit=2):
        """Main scraping function"""
        scraped_data = {
            "url": url,
            "timestamp": datetime.now().isoformat(),
            "data_types": data_types,
            "pages_crawled": 0,
            "errors": []
        }
        # Initialise before the try block so the finally clause can always reference it
        use_selenium = False
        try:
            # Setup Selenium if needed for dynamic content
            use_selenium = "images" in data_types or "tables" in data_types
            if use_selenium:
                if not self.setup_selenium():
                    scraped_data["errors"].append("Failed to setup Selenium for dynamic content")

            # Get page content
            content = self.get_page_content(url, use_selenium)
            if not content:
                scraped_data["errors"].append("Failed to fetch page content")
                return scraped_data

            # Parse with BeautifulSoup
            soup = BeautifulSoup(content, 'html.parser')
            scraped_data["pages_crawled"] = 1

            # Extract data based on selected types
            if "text" in data_types:
                scraped_data["text_content"] = self.extract_text_content(soup)
            if "images" in data_types:
                scraped_data["images"] = self.extract_images(soup, url)
            if "links" in data_types:
                scraped_data["links"] = self.extract_links(soup, url)
            if "tables" in data_types:
                scraped_data["tables"] = self.extract_tables(soup)
            if "metadata" in data_types:
                scraped_data["metadata"] = self.extract_metadata(soup)
            if "numbers" in data_types:
                scraped_data["numbers"] = self.extract_numbers(soup)

            # Rate limiting between requests
            time.sleep(rate_limit)
        except Exception as e:
            scraped_data["errors"].append(f"Scraping error: {str(e)}")
        finally:
            # Clean up Selenium
            if use_selenium:
                self.close_selenium()
        return scraped_data


# Global scraper instance
scraper = WebScraper()
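

# Illustrative usage sketch (not part of the original module): one way the
# global scraper instance might be driven from a script. The URL and the
# data_types list below are placeholder values chosen for the example.
if __name__ == "__main__":
    result = scraper.scrape_website(
        "https://example.com",
        data_types=["text", "links", "metadata"],
        rate_limit=1
    )
    # Pretty-print the collected data as JSON for quick inspection
    print(json.dumps(result, indent=2, ensure_ascii=False))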