#!/usr/bin/env python3
"""
Scikit-learn Documentation Scraper

This module scrapes the Scikit-learn User Guide documentation and saves the
content to a JSON file for use in a RAG application.

Author: AI Assistant
Date: September 2025
"""

import json
import re
import time
from typing import Dict, List, Optional
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup


class ScikitLearnScraper:
    """
    A web scraper for extracting content from Scikit-learn documentation.

    This class handles the extraction of text content from the Scikit-learn
    User Guide pages, with proper error handling and content cleaning.
    """

    def __init__(self, base_url: str = "https://scikit-learn.org/stable/user_guide.html"):
        """
        Initialize the scraper with the base URL.

        Args:
            base_url (str): The main User Guide URL to start scraping from
        """
        self.base_url = base_url
        self.base_domain = "https://scikit-learn.org"
        # Reuse one session for connection pooling across the many page fetches.
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Scikit-learn Documentation Scraper)'
        })

    def get_page_content(self, url: str, timeout: int = 10) -> Optional[BeautifulSoup]:
        """
        Fetch and parse a web page.

        Args:
            url (str): URL to fetch
            timeout (int): Request timeout in seconds

        Returns:
            BeautifulSoup: Parsed HTML content or None if failed
        """
        try:
            print(f"Fetching: {url}")
            response = self.session.get(url, timeout=timeout)
            response.raise_for_status()
            return BeautifulSoup(response.content, 'html.parser')
        except requests.exceptions.RequestException as e:
            # Network-level failures (timeouts, HTTP errors, DNS, ...) are
            # reported but not raised, so a single bad page can't stop a run.
            print(f"Error fetching {url}: {e}")
            return None
        except Exception as e:
            # Catch-all for parser errors; logged so the caller can skip the page.
            print(f"Unexpected error parsing {url}: {e}")
            return None

    def extract_user_guide_links(self, soup: BeautifulSoup) -> List[str]:
        """
        Extract all user guide links from the main page.

        Args:
            soup (BeautifulSoup): Parsed HTML of the main user guide page

        Returns:
            List[str]: Sorted list of unique absolute URLs to user guide sections
        """
        links = []

        # Look for the main content area and find all links.
        # Scikit-learn user guide has links in the main content area.
        main_content = soup.find('div', class_='body') or soup.find('main') or soup

        if main_content:
            # Find all links that are part of the user guide.
            for link in main_content.find_all('a', href=True):
                href = link.get('href')
                # Filter for user guide links (typically contain specific patterns).
                if href and (
                    href.startswith('modules/') or
                    href.startswith('supervised_learning') or
                    href.startswith('unsupervised_learning') or
                    href.startswith('model_selection') or
                    href.startswith('data_transforms') or
                    href.startswith('datasets/') or
                    href.startswith('computing/') or
                    href.startswith('model_persistence') or
                    'user_guide' in href
                ):
                    # Convert to absolute URL relative to the /stable/ tree.
                    absolute_url = urljoin(self.base_domain + '/stable/', href)
                    # Ensure it's an HTML page (explicit .html, or no file
                    # extension at all on the last path segment).
                    if absolute_url.endswith('.html') or '.' not in absolute_url.split('/')[-1]:
                        links.append(absolute_url)

        # Also look for table of contents or navigation menus.
        toc_sections = soup.find_all(['div', 'nav'],
                                     class_=['toctree-wrapper', 'navigation', 'sidebar'])
        for section in toc_sections:
            for link in section.find_all('a', href=True):
                href = link.get('href')
                # Skip in-page anchors and already-absolute external links.
                if href and not href.startswith('#') and not href.startswith('http'):
                    absolute_url = urljoin(self.base_domain + '/stable/', href)
                    if absolute_url.endswith('.html') and 'user_guide' in absolute_url:
                        links.append(absolute_url)

        # Remove duplicates and sort for a deterministic crawl order.
        unique_links = list(set(links))
        unique_links.sort()

        print(f"Found {len(unique_links)} user guide links")
        return unique_links

    def clean_text(self, text: str) -> str:
        """
        Clean and normalize extracted text content.

        Args:
            text (str): Raw text content

        Returns:
            str: Cleaned text content
        """
        # Collapse runs of 3+ newlines (with interleaved whitespace) to one blank line.
        text = re.sub(r'\n\s*\n\s*\n+', '\n\n', text)
        # Collapse runs of spaces/tabs to a single space.
        text = re.sub(r'[ \t]+', ' ', text)

        # Remove leading/trailing whitespace from each line.
        lines = [line.strip() for line in text.split('\n')]
        text = '\n'.join(lines)

        # Remove excessive blank lines at start/end.
        text = text.strip()

        return text

    def extract_main_content(self, soup: BeautifulSoup) -> str:
        """
        Extract the main text content from a documentation page.

        Note: this mutates ``soup`` in place (navigation/script/style elements
        are decomposed before text extraction).

        Args:
            soup (BeautifulSoup): Parsed HTML of the page

        Returns:
            str: Cleaned main text content
        """
        # Remove script and style elements plus obvious page chrome.
        for element in soup(['script', 'style', 'nav', 'header', 'footer']):
            element.decompose()

        # Try to find the main content area (common selectors for documentation sites).
        main_content = None
        content_selectors = [
            'div.body',
            'div.document',
            'main',
            'div.content',
            'div.main-content',
            'article',
            'div.rst-content'
        ]

        for selector in content_selectors:
            main_content = soup.select_one(selector)
            if main_content:
                break

        # If no main content found, use the whole body.
        if not main_content:
            main_content = soup.find('body') or soup

        # Remove sidebar and navigation elements.
        for element in main_content.find_all(['aside', 'nav']):
            element.decompose()

        # Remove elements with specific classes that are typically navigation/sidebar.
        remove_classes = [
            'sidebar', 'navigation', 'toctree', 'breadcrumb',
            'headerlink', 'viewcode-link', 'edit-on-github'
        ]
        for class_name in remove_classes:
            for element in main_content.find_all(class_=class_name):
                element.decompose()

        # Extract text content with newline separators between elements.
        text = main_content.get_text(separator='\n', strip=True)

        return self.clean_text(text)

    def scrape_page(self, url: str) -> Optional[Dict[str, str]]:
        """
        Scrape a single page and return its content.

        Args:
            url (str): URL to scrape

        Returns:
            Dict[str, str]: Dictionary with 'url' and 'text' keys, or None if failed
        """
        print(f"Scraping page: {url}")

        soup = self.get_page_content(url)
        if not soup:
            return None

        text_content = self.extract_main_content(soup)

        # Skip pages with minimal content (redirects, stubs, empty shells).
        if len(text_content.strip()) < 100:
            print(f"Skipping page with minimal content: {url}")
            return None

        return {
            "url": url,
            "text": text_content
        }

    def scrape_all(self, delay: float = 1.0) -> List[Dict[str, str]]:
        """
        Scrape all user guide pages and return the content.

        Args:
            delay (float): Delay between requests in seconds

        Returns:
            List[Dict[str, str]]: List of dictionaries with scraped content
        """
        print("Starting Scikit-learn documentation scraping...")

        # Get the main user guide page.
        main_soup = self.get_page_content(self.base_url)
        if not main_soup:
            print("Failed to fetch main user guide page")
            return []

        # Extract all user guide links.
        links = self.extract_user_guide_links(main_soup)

        # Add the main page itself.
        all_links = [self.base_url] + links

        scraped_content = []

        for i, url in enumerate(all_links, 1):
            try:
                content = self.scrape_page(url)
                if content:
                    scraped_content.append(content)
                    print(f"Successfully scraped {i}/{len(all_links)}: {url}")
                else:
                    print(f"Failed to scrape {i}/{len(all_links)}: {url}")

                # Add delay to be respectful to the server (skip after the last page).
                if i < len(all_links):
                    time.sleep(delay)

            except KeyboardInterrupt:
                # Allow Ctrl-C to stop cleanly while keeping what was scraped so far.
                print("\nScraping interrupted by user")
                break
            except Exception as e:
                print(f"Unexpected error scraping {url}: {e}")
                continue

        print(f"\nScraping completed! Total pages scraped: {len(scraped_content)}")
        return scraped_content

    def save_to_json(self, content: List[Dict[str, str]], filename: str = "scraped_content.json"):
        """
        Save scraped content to a JSON file.

        Args:
            content (List[Dict[str, str]]): Scraped content
            filename (str): Output filename
        """
        try:
            with open(filename, 'w', encoding='utf-8') as f:
                # ensure_ascii=False keeps non-ASCII documentation text readable.
                json.dump(content, f, indent=2, ensure_ascii=False)
            # BUG FIX: the messages previously printed the literal text
            # "(unknown)" instead of interpolating the output filename.
            print(f"Content saved to {filename}")
        except Exception as e:
            print(f"Error saving to {filename}: {e}")


def main():
    """
    Main function to run the scraper.
    """
    print("Scikit-learn Documentation Scraper")
    print("=" * 50)

    # Initialize scraper.
    scraper = ScikitLearnScraper()

    # Scrape all content.
    content = scraper.scrape_all(delay=1.0)

    if content:
        # Save to JSON file.
        scraper.save_to_json(content)

        # Print summary.
        total_chars = sum(len(item['text']) for item in content)
        print("\nSummary:")
        print(f"- Pages scraped: {len(content)}")
        print(f"- Total characters: {total_chars:,}")
        print(f"- Average characters per page: {total_chars // len(content):,}")
    else:
        print("No content was scraped successfully.")


if __name__ == "__main__":
    main()