Spaces:
Sleeping
Sleeping
| #!/usr/bin/env python3 | |
| """ | |
| Scikit-learn Documentation Scraper | |
| This module scrapes the Scikit-learn User Guide documentation and saves | |
| the content to a JSON file for use in a RAG application. | |
| Author: AI Assistant | |
| Date: September 2025 | |
| """ | |
| import json | |
| import re | |
| import time | |
| from typing import Dict, List, Optional | |
| from urllib.parse import urljoin, urlparse | |
| import requests | |
| from bs4 import BeautifulSoup | |
class ScikitLearnScraper:
    """
    A web scraper for extracting content from Scikit-learn documentation.

    This class handles the extraction of text content from the Scikit-learn
    User Guide pages, with proper error handling and content cleaning.
    """

    def __init__(self, base_url: str = "https://scikit-learn.org/stable/user_guide.html"):
        """
        Initialize the scraper with the base URL.

        Args:
            base_url (str): The main User Guide URL to start scraping from
        """
        self.base_url = base_url
        self.base_domain = "https://scikit-learn.org"
        # One shared session gives connection pooling and a consistent
        # User-Agent for every request.
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Scikit-learn Documentation Scraper)'
        })

    def get_page_content(self, url: str, timeout: int = 10) -> Optional["BeautifulSoup"]:
        """
        Fetch and parse a web page.

        Args:
            url (str): URL to fetch
            timeout (int): Request timeout in seconds

        Returns:
            BeautifulSoup: Parsed HTML content, or None if fetching or
            parsing failed.
        """
        try:
            print(f"Fetching: {url}")
            response = self.session.get(url, timeout=timeout)
            response.raise_for_status()
            return BeautifulSoup(response.content, 'html.parser')
        except requests.exceptions.RequestException as e:
            # Network-level failures: DNS, timeouts, non-2xx statuses.
            print(f"Error fetching {url}: {e}")
            return None
        except Exception as e:
            # Swallow parser surprises so one bad page cannot abort the crawl.
            print(f"Unexpected error parsing {url}: {e}")
            return None

    def extract_user_guide_links(self, soup: "BeautifulSoup") -> List[str]:
        """
        Extract all user guide links from the main page.

        Args:
            soup (BeautifulSoup): Parsed HTML of the main user guide page

        Returns:
            List[str]: Sorted, de-duplicated list of absolute URLs to user
            guide sections.
        """
        links = []
        # Look for the main content area and find all links.
        # Scikit-learn's user guide keeps its links in the main content area.
        main_content = soup.find('div', class_='body') or soup.find('main') or soup
        if main_content:
            # Find all links that are part of the user guide
            for link in main_content.find_all('a', href=True):
                href = link.get('href')
                # Filter for user guide links (known section path prefixes)
                if href and (
                    href.startswith('modules/') or
                    href.startswith('supervised_learning') or
                    href.startswith('unsupervised_learning') or
                    href.startswith('model_selection') or
                    href.startswith('data_transforms') or
                    href.startswith('datasets/') or
                    href.startswith('computing/') or
                    href.startswith('model_persistence') or
                    'user_guide' in href
                ):
                    # Convert to absolute URL relative to the /stable/ docs root
                    absolute_url = urljoin(self.base_domain + '/stable/', href)
                    # Keep only HTML pages (.html suffix or an extension-less path)
                    if absolute_url.endswith('.html') or '.' not in absolute_url.split('/')[-1]:
                        links.append(absolute_url)
        # Also look for table-of-contents or navigation menus
        toc_sections = soup.find_all(['div', 'nav'], class_=['toctree-wrapper', 'navigation', 'sidebar'])
        for section in toc_sections:
            for link in section.find_all('a', href=True):
                href = link.get('href')
                # Only relative links; skip in-page anchors and external URLs
                if href and not href.startswith('#') and not href.startswith('http'):
                    absolute_url = urljoin(self.base_domain + '/stable/', href)
                    if absolute_url.endswith('.html') and 'user_guide' in absolute_url:
                        links.append(absolute_url)
        # De-duplicate and sort for a deterministic crawl order
        unique_links = list(set(links))
        unique_links.sort()
        print(f"Found {len(unique_links)} user guide links")
        return unique_links

    def clean_text(self, text: str) -> str:
        """
        Clean and normalize extracted text content.

        Args:
            text (str): Raw text content

        Returns:
            str: Cleaned text content with runs of blank lines collapsed to
            one, horizontal whitespace collapsed to single spaces, and each
            line stripped.
        """
        # Collapse three-or-more consecutive (possibly blank) newlines to two
        text = re.sub(r'\n\s*\n\s*\n+', '\n\n', text)
        # Collapse runs of spaces/tabs (but not newlines) to a single space
        text = re.sub(r'[ \t]+', ' ', text)
        # Remove leading/trailing whitespace from each line
        lines = [line.strip() for line in text.split('\n')]
        text = '\n'.join(lines)
        # Remove blank lines at start/end
        text = text.strip()
        return text

    def extract_main_content(self, soup: "BeautifulSoup") -> str:
        """
        Extract the main text content from a documentation page.

        Note: this mutates `soup` by removing script/style/navigation nodes.

        Args:
            soup (BeautifulSoup): Parsed HTML of the page

        Returns:
            str: Cleaned main text content
        """
        # Remove script and style elements plus obvious chrome
        for element in soup(['script', 'style', 'nav', 'header', 'footer']):
            element.decompose()
        # Try to find the main content area (common selectors for doc sites)
        main_content = None
        content_selectors = [
            'div.body',
            'div.document',
            'main',
            'div.content',
            'div.main-content',
            'article',
            'div.rst-content'
        ]
        for selector in content_selectors:
            main_content = soup.select_one(selector)
            if main_content:
                break
        # If no main content found, fall back to the whole body
        if not main_content:
            main_content = soup.find('body') or soup
        # Remove sidebar and navigation elements
        for element in main_content.find_all(['aside', 'nav']):
            element.decompose()
        # Remove elements with classes that are typically navigation/sidebar
        remove_classes = [
            'sidebar', 'navigation', 'toctree', 'breadcrumb',
            'headerlink', 'viewcode-link', 'edit-on-github'
        ]
        for class_name in remove_classes:
            for element in main_content.find_all(class_=class_name):
                element.decompose()
        # Extract text content, one node per line, then normalize
        text = main_content.get_text(separator='\n', strip=True)
        return self.clean_text(text)

    def scrape_page(self, url: str) -> Optional[Dict[str, str]]:
        """
        Scrape a single page and return its content.

        Args:
            url (str): URL to scrape

        Returns:
            Dict[str, str]: Dictionary with 'url' and 'text' keys, or None
            if the fetch failed or the page had under 100 chars of content.
        """
        print(f"Scraping page: {url}")
        soup = self.get_page_content(url)
        if not soup:
            return None
        text_content = self.extract_main_content(soup)
        if len(text_content.strip()) < 100:  # Skip pages with minimal content
            print(f"Skipping page with minimal content: {url}")
            return None
        return {
            "url": url,
            "text": text_content
        }

    def scrape_all(self, delay: float = 1.0) -> List[Dict[str, str]]:
        """
        Scrape all user guide pages and return the content.

        Args:
            delay (float): Delay between requests in seconds

        Returns:
            List[Dict[str, str]]: List of dictionaries with scraped content;
            empty if the main user guide page could not be fetched.
        """
        print("Starting Scikit-learn documentation scraping...")
        # Get the main user guide page
        main_soup = self.get_page_content(self.base_url)
        if not main_soup:
            print("Failed to fetch main user guide page")
            return []
        # Extract all user guide links
        links = self.extract_user_guide_links(main_soup)
        # Add the main page itself, scraped first
        all_links = [self.base_url] + links
        scraped_content = []
        for i, url in enumerate(all_links, 1):
            try:
                content = self.scrape_page(url)
                if content:
                    scraped_content.append(content)
                    print(f"Successfully scraped {i}/{len(all_links)}: {url}")
                else:
                    print(f"Failed to scrape {i}/{len(all_links)}: {url}")
                # Add delay to be respectful to the server (skip after the last)
                if i < len(all_links):
                    time.sleep(delay)
            except KeyboardInterrupt:
                # Let the user stop mid-crawl and keep what was collected
                print("\nScraping interrupted by user")
                break
            except Exception as e:
                print(f"Unexpected error scraping {url}: {e}")
                continue
        print(f"\nScraping completed! Total pages scraped: {len(scraped_content)}")
        return scraped_content

    def save_to_json(self, content: List[Dict[str, str]], filename: str = "scraped_content.json"):
        """
        Save scraped content to a JSON file.

        Args:
            content (List[Dict[str, str]]): Scraped content
            filename (str): Output filename
        """
        try:
            with open(filename, 'w', encoding='utf-8') as f:
                json.dump(content, f, indent=2, ensure_ascii=False)
            # Bug fix: these messages previously printed a literal
            # "(unknown)" placeholder instead of the actual filename.
            print(f"Content saved to {filename}")
        except Exception as e:
            print(f"Error saving to {filename}: {e}")
def main():
    """
    Main function to run the scraper.
    """
    print("Scikit-learn Documentation Scraper")
    print("=" * 50)

    # Build the scraper and crawl with a polite 1-second inter-request delay.
    doc_scraper = ScikitLearnScraper()
    pages = doc_scraper.scrape_all(delay=1.0)

    if not pages:
        # Nothing came back -- report and bail out early.
        print("No content was scraped successfully.")
        return

    # Persist the scraped pages to the default JSON file.
    doc_scraper.save_to_json(pages)

    # Report simple size statistics about what was collected.
    char_count = sum(len(page['text']) for page in pages)
    print("\nSummary:")
    print(f"- Pages scraped: {len(pages)}")
    print(f"- Total characters: {char_count:,}")
    print(f"- Average characters per page: {char_count // len(pages):,}")


if __name__ == "__main__":
    main()