# scikit-rag / scraper.py — fguryel, commit 9222df3 ("init")
#!/usr/bin/env python3
"""
Scikit-learn Documentation Scraper
This module scrapes the Scikit-learn User Guide documentation and saves
the content to a JSON file for use in a RAG application.
Author: AI Assistant
Date: September 2025
"""
import json
import re
import time
from typing import Dict, List, Optional
from urllib.parse import urljoin, urlparse
import requests
from bs4 import BeautifulSoup
class ScikitLearnScraper:
    """
    A web scraper for extracting content from Scikit-learn documentation.

    This class handles the extraction of text content from the Scikit-learn
    User Guide pages, with proper error handling and content cleaning.
    """

    def __init__(self, base_url: str = "https://scikit-learn.org/stable/user_guide.html"):
        """
        Initialize the scraper with the base URL.

        Args:
            base_url (str): The main User Guide URL to start scraping from
        """
        self.base_url = base_url
        self.base_domain = "https://scikit-learn.org"
        # One Session is reused for every request: connection pooling + shared headers.
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Scikit-learn Documentation Scraper)'
        })

    def get_page_content(self, url: str, timeout: int = 10) -> Optional["BeautifulSoup"]:
        """
        Fetch and parse a web page.

        Args:
            url (str): URL to fetch
            timeout (int): Request timeout in seconds

        Returns:
            BeautifulSoup: Parsed HTML content, or None if fetching/parsing failed
        """
        try:
            print(f"Fetching: {url}")
            response = self.session.get(url, timeout=timeout)
            # Turn HTTP 4xx/5xx responses into RequestException.
            response.raise_for_status()
            return BeautifulSoup(response.content, 'html.parser')
        except requests.exceptions.RequestException as e:
            print(f"Error fetching {url}: {e}")
            return None
        except Exception as e:
            # Parsing errors are logged but never abort the whole crawl.
            print(f"Unexpected error parsing {url}: {e}")
            return None

    def extract_user_guide_links(self, soup: "BeautifulSoup") -> List[str]:
        """
        Extract all user guide links from the main page.

        Args:
            soup (BeautifulSoup): Parsed HTML of the main user guide page

        Returns:
            List[str]: Sorted, de-duplicated absolute URLs to user guide sections
        """
        links = []
        # Path prefixes that identify user-guide section pages.
        guide_prefixes = (
            'modules/',
            'supervised_learning',
            'unsupervised_learning',
            'model_selection',
            'data_transforms',
            'datasets/',
            'computing/',
            'model_persistence',
        )
        # Look for the main content area and find all links.
        # Scikit-learn's user guide keeps its links in the main content area.
        main_content = soup.find('div', class_='body') or soup.find('main') or soup
        if main_content:
            for link in main_content.find_all('a', href=True):
                href = link.get('href')
                # str.startswith accepts a tuple — one call instead of a long `or` chain.
                if href and (href.startswith(guide_prefixes) or 'user_guide' in href):
                    # Convert to an absolute URL under the /stable/ tree.
                    absolute_url = urljoin(self.base_domain + '/stable/', href)
                    # Keep HTML pages, or extensionless (directory-style) URLs.
                    if absolute_url.endswith('.html') or '.' not in absolute_url.split('/')[-1]:
                        links.append(absolute_url)
        # Also look for table-of-contents and navigation menus.
        toc_sections = soup.find_all(['div', 'nav'], class_=['toctree-wrapper', 'navigation', 'sidebar'])
        for section in toc_sections:
            for link in section.find_all('a', href=True):
                href = link.get('href')
                # Skip in-page anchors and already-absolute external links.
                if href and not href.startswith('#') and not href.startswith('http'):
                    absolute_url = urljoin(self.base_domain + '/stable/', href)
                    if absolute_url.endswith('.html') and 'user_guide' in absolute_url:
                        links.append(absolute_url)
        # De-duplicate and sort for a deterministic crawl order.
        unique_links = sorted(set(links))
        print(f"Found {len(unique_links)} user guide links")
        return unique_links

    def clean_text(self, text: str) -> str:
        """
        Clean and normalize extracted text content.

        Args:
            text (str): Raw text content

        Returns:
            str: Cleaned text content
        """
        # Collapse runs of 3+ newlines (with interleaved whitespace) into one blank line.
        text = re.sub(r'\n\s*\n\s*\n+', '\n\n', text)
        # Collapse runs of spaces/tabs into a single space.
        text = re.sub(r'[ \t]+', ' ', text)
        # Remove leading/trailing whitespace from each line.
        lines = [line.strip() for line in text.split('\n')]
        text = '\n'.join(lines)
        # Trim blank lines at start/end.
        return text.strip()

    def extract_main_content(self, soup: "BeautifulSoup") -> str:
        """
        Extract the main text content from a documentation page.

        Note: mutates `soup` in place (decomposes script/nav/sidebar elements).

        Args:
            soup (BeautifulSoup): Parsed HTML of the page

        Returns:
            str: Cleaned main text content
        """
        # Remove script/style and obvious page-chrome elements.
        for element in soup(['script', 'style', 'nav', 'header', 'footer']):
            element.decompose()
        # Try common documentation-site selectors for the main content area.
        main_content = None
        content_selectors = [
            'div.body',
            'div.document',
            'main',
            'div.content',
            'div.main-content',
            'article',
            'div.rst-content'
        ]
        for selector in content_selectors:
            main_content = soup.select_one(selector)
            if main_content:
                break
        # If no main content found, fall back to the whole body.
        if not main_content:
            main_content = soup.find('body') or soup
        # Remove sidebar and navigation elements.
        for element in main_content.find_all(['aside', 'nav']):
            element.decompose()
        # Remove elements whose classes mark navigation/sidebar chrome.
        remove_classes = [
            'sidebar', 'navigation', 'toctree', 'breadcrumb',
            'headerlink', 'viewcode-link', 'edit-on-github'
        ]
        for class_name in remove_classes:
            for element in main_content.find_all(class_=class_name):
                element.decompose()
        # Extract text content, one line per element.
        text = main_content.get_text(separator='\n', strip=True)
        return self.clean_text(text)

    def scrape_page(self, url: str, min_length: int = 100) -> Optional[Dict[str, str]]:
        """
        Scrape a single page and return its content.

        Args:
            url (str): URL to scrape
            min_length (int): Minimum cleaned-text length; shorter pages are skipped

        Returns:
            Dict[str, str]: Dictionary with 'url' and 'text' keys, or None if failed
        """
        print(f"Scraping page: {url}")
        soup = self.get_page_content(url)
        if not soup:
            return None
        text_content = self.extract_main_content(soup)
        # Pages with almost no text are usually redirects or stubs.
        if len(text_content.strip()) < min_length:
            print(f"Skipping page with minimal content: {url}")
            return None
        return {
            "url": url,
            "text": text_content
        }

    def scrape_all(self, delay: float = 1.0) -> List[Dict[str, str]]:
        """
        Scrape all user guide pages and return the content.

        Args:
            delay (float): Delay between requests in seconds (politeness throttle)

        Returns:
            List[Dict[str, str]]: List of dictionaries with scraped content
        """
        print("Starting Scikit-learn documentation scraping...")
        # Get the main user guide page first; without it there is nothing to crawl.
        main_soup = self.get_page_content(self.base_url)
        if not main_soup:
            print("Failed to fetch main user guide page")
            return []
        # Extract all user guide links and include the main page itself.
        links = self.extract_user_guide_links(main_soup)
        all_links = [self.base_url] + links
        scraped_content = []
        for i, url in enumerate(all_links, 1):
            try:
                content = self.scrape_page(url)
                if content:
                    scraped_content.append(content)
                    print(f"Successfully scraped {i}/{len(all_links)}: {url}")
                else:
                    print(f"Failed to scrape {i}/{len(all_links)}: {url}")
                # Throttle between requests to be respectful to the server.
                if i < len(all_links):
                    time.sleep(delay)
            except KeyboardInterrupt:
                # Ctrl-C stops cleanly, keeping whatever was scraped so far.
                print("\nScraping interrupted by user")
                break
            except Exception as e:
                print(f"Unexpected error scraping {url}: {e}")
                continue
        print(f"\nScraping completed! Total pages scraped: {len(scraped_content)}")
        return scraped_content

    def save_to_json(self, content: List[Dict[str, str]], filename: str = "scraped_content.json"):
        """
        Save scraped content to a JSON file.

        Args:
            content (List[Dict[str, str]]): Scraped content
            filename (str): Output filename
        """
        try:
            with open(filename, 'w', encoding='utf-8') as f:
                json.dump(content, f, indent=2, ensure_ascii=False)
            # Fixed: the message previously printed the literal "(unknown)"
            # instead of interpolating the filename.
            print(f"Content saved to {filename}")
        except Exception as e:
            print(f"Error saving to {filename}: {e}")
def main():
    """
    Entry point: scrape the Scikit-learn user guide and persist it as JSON.
    """
    print("Scikit-learn Documentation Scraper")
    print("=" * 50)

    # Build the scraper and crawl every user-guide page (1s between requests).
    scraper = ScikitLearnScraper()
    content = scraper.scrape_all(delay=1.0)

    # Guard clause: nothing scraped means nothing to save or summarize.
    if not content:
        print("No content was scraped successfully.")
        return

    # Persist the results, then report simple corpus statistics.
    scraper.save_to_json(content)
    total_chars = sum(len(item['text']) for item in content)
    print(f"\nSummary:")
    print(f"- Pages scraped: {len(content)}")
    print(f"- Total characters: {total_chars:,}")
    print(f"- Average characters per page: {total_chars // len(content):,}")
# Run the scraper only when executed as a script, so importing this
# module (e.g. from the RAG app) does not trigger a network crawl.
if __name__ == "__main__":
    main()