Spaces:

fguryel
/

scikit-rag

Sleeping

File size: 11,093 Bytes

9222df3

#!/usr/bin/env python3
"""
Scikit-learn Documentation Scraper

This module scrapes the Scikit-learn User Guide documentation and saves
the content to a JSON file for use in a RAG application.

Author: AI Assistant
Date: September 2025
"""

import json
import re
import time
from typing import Dict, List, Optional
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup


class ScikitLearnScraper:
    """
    A web scraper for extracting content from Scikit-learn documentation.
    
    This class handles the extraction of text content from the Scikit-learn
    User Guide pages, with proper error handling and content cleaning.
    """
    
    def __init__(self, base_url: str = "https://scikit-learn.org/stable/user_guide.html"):
        """
        Initialize the scraper with the base URL.
        
        Args:
            base_url (str): The main User Guide URL to start scraping from
        """
        self.base_url = base_url
        self.base_domain = "https://scikit-learn.org"
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Scikit-learn Documentation Scraper)'
        })
        
    def get_page_content(self, url: str, timeout: int = 10) -> Optional[BeautifulSoup]:
        """
        Fetch and parse a web page.
        
        Args:
            url (str): URL to fetch
            timeout (int): Request timeout in seconds
            
        Returns:
            BeautifulSoup: Parsed HTML content or None if failed
        """
        try:
            print(f"Fetching: {url}")
            response = self.session.get(url, timeout=timeout)
            response.raise_for_status()
            
            return BeautifulSoup(response.content, 'html.parser')
            
        except requests.exceptions.RequestException as e:
            print(f"Error fetching {url}: {e}")
            return None
        except Exception as e:
            print(f"Unexpected error parsing {url}: {e}")
            return None
    
    def extract_user_guide_links(self, soup: BeautifulSoup) -> List[str]:
        """
        Extract all user guide links from the main page.
        
        Args:
            soup (BeautifulSoup): Parsed HTML of the main user guide page
            
        Returns:
            List[str]: List of absolute URLs to user guide sections
        """
        links = []
        
        # Look for the main content area and find all links
        # Scikit-learn user guide has links in the main content area
        main_content = soup.find('div', class_='body') or soup.find('main') or soup
        
        if main_content:
            # Find all links that are part of the user guide
            for link in main_content.find_all('a', href=True):
                href = link.get('href')
                
                # Filter for user guide links (typically contain specific patterns)
                if href and (
                    href.startswith('modules/') or 
                    href.startswith('supervised_learning') or
                    href.startswith('unsupervised_learning') or
                    href.startswith('model_selection') or
                    href.startswith('data_transforms') or
                    href.startswith('datasets/') or
                    href.startswith('computing/') or
                    href.startswith('model_persistence') or
                    'user_guide' in href
                ):
                    # Convert to absolute URL
                    absolute_url = urljoin(self.base_domain + '/stable/', href)
                    
                    # Ensure it's an HTML page
                    if absolute_url.endswith('.html') or not '.' in absolute_url.split('/')[-1]:
                        links.append(absolute_url)
        
        # Also look for table of contents or navigation menus
        toc_sections = soup.find_all(['div', 'nav'], class_=['toctree-wrapper', 'navigation', 'sidebar'])
        for section in toc_sections:
            for link in section.find_all('a', href=True):
                href = link.get('href')
                if href and not href.startswith('#') and not href.startswith('http'):
                    absolute_url = urljoin(self.base_domain + '/stable/', href)
                    if absolute_url.endswith('.html') and 'user_guide' in absolute_url:
                        links.append(absolute_url)
        
        # Remove duplicates and sort
        unique_links = list(set(links))
        unique_links.sort()
        
        print(f"Found {len(unique_links)} user guide links")
        return unique_links
    
    def clean_text(self, text: str) -> str:
        """
        Clean and normalize extracted text content.
        
        Args:
            text (str): Raw text content
            
        Returns:
            str: Cleaned text content
        """
        # Remove excessive whitespace and newlines
        text = re.sub(r'\n\s*\n\s*\n+', '\n\n', text)
        text = re.sub(r'[ \t]+', ' ', text)
        
        # Remove leading/trailing whitespace from each line
        lines = [line.strip() for line in text.split('\n')]
        text = '\n'.join(lines)
        
        # Remove excessive blank lines at start/end
        text = text.strip()
        
        return text
    
    def extract_main_content(self, soup: BeautifulSoup) -> str:
        """
        Extract the main text content from a documentation page.
        
        Args:
            soup (BeautifulSoup): Parsed HTML of the page
            
        Returns:
            str: Cleaned main text content
        """
        # Remove script and style elements
        for element in soup(['script', 'style', 'nav', 'header', 'footer']):
            element.decompose()
        
        # Try to find the main content area (common selectors for documentation sites)
        main_content = None
        content_selectors = [
            'div.body',
            'div.document',
            'main',
            'div.content',
            'div.main-content',
            'article',
            'div.rst-content'
        ]
        
        for selector in content_selectors:
            main_content = soup.select_one(selector)
            if main_content:
                break
        
        # If no main content found, use the whole body
        if not main_content:
            main_content = soup.find('body') or soup
        
        # Remove sidebar and navigation elements
        for element in main_content.find_all(['aside', 'nav']):
            element.decompose()
        
        # Remove elements with specific classes that are typically navigation/sidebar
        remove_classes = [
            'sidebar', 'navigation', 'toctree', 'breadcrumb',
            'headerlink', 'viewcode-link', 'edit-on-github'
        ]
        
        for class_name in remove_classes:
            for element in main_content.find_all(class_=class_name):
                element.decompose()
        
        # Extract text content
        text = main_content.get_text(separator='\n', strip=True)
        
        return self.clean_text(text)
    
    def scrape_page(self, url: str) -> Optional[Dict[str, str]]:
        """
        Scrape a single page and return its content.
        
        Args:
            url (str): URL to scrape
            
        Returns:
            Dict[str, str]: Dictionary with 'url' and 'text' keys, or None if failed
        """
        print(f"Scraping page: {url}")
        
        soup = self.get_page_content(url)
        if not soup:
            return None
        
        text_content = self.extract_main_content(soup)
        
        if len(text_content.strip()) < 100:  # Skip pages with minimal content
            print(f"Skipping page with minimal content: {url}")
            return None
        
        return {
            "url": url,
            "text": text_content
        }
    
    def scrape_all(self, delay: float = 1.0) -> List[Dict[str, str]]:
        """
        Scrape all user guide pages and return the content.
        
        Args:
            delay (float): Delay between requests in seconds
            
        Returns:
            List[Dict[str, str]]: List of dictionaries with scraped content
        """
        print("Starting Scikit-learn documentation scraping...")
        
        # Get the main user guide page
        main_soup = self.get_page_content(self.base_url)
        if not main_soup:
            print("Failed to fetch main user guide page")
            return []
        
        # Extract all user guide links
        links = self.extract_user_guide_links(main_soup)
        
        # Add the main page itself
        all_links = [self.base_url] + links
        
        scraped_content = []
        
        for i, url in enumerate(all_links, 1):
            try:
                content = self.scrape_page(url)
                if content:
                    scraped_content.append(content)
                    print(f"Successfully scraped {i}/{len(all_links)}: {url}")
                else:
                    print(f"Failed to scrape {i}/{len(all_links)}: {url}")
                
                # Add delay to be respectful to the server
                if i < len(all_links):
                    time.sleep(delay)
                    
            except KeyboardInterrupt:
                print("\nScraping interrupted by user")
                break
            except Exception as e:
                print(f"Unexpected error scraping {url}: {e}")
                continue
        
        print(f"\nScraping completed! Total pages scraped: {len(scraped_content)}")
        return scraped_content
    
    def save_to_json(self, content: List[Dict[str, str]], filename: str = "scraped_content.json"):
        """
        Save scraped content to a JSON file.
        
        Args:
            content (List[Dict[str, str]]): Scraped content
            filename (str): Output filename
        """
        try:
            with open(filename, 'w', encoding='utf-8') as f:
                json.dump(content, f, indent=2, ensure_ascii=False)
            print(f"Content saved to {filename}")
        except Exception as e:
            print(f"Error saving to {filename}: {e}")


def main():
    """
    Main function to run the scraper.
    """
    print("Scikit-learn Documentation Scraper")
    print("=" * 50)
    
    # Initialize scraper
    scraper = ScikitLearnScraper()
    
    # Scrape all content
    content = scraper.scrape_all(delay=1.0)
    
    if content:
        # Save to JSON file
        scraper.save_to_json(content)
        
        # Print summary
        total_chars = sum(len(item['text']) for item in content)
        print(f"\nSummary:")
        print(f"- Pages scraped: {len(content)}")
        print(f"- Total characters: {total_chars:,}")
        print(f"- Average characters per page: {total_chars // len(content):,}")
    else:
        print("No content was scraped successfully.")


if __name__ == "__main__":
    main()