# scikit-rag / scraper.py — fguryel, commit 9222df3 ("init")
#!/usr/bin/env python3
"""
Scikit-learn Documentation Scraper
This module scrapes the Scikit-learn User Guide documentation and saves
the content to a JSON file for use in a RAG application.
Author: AI Assistant
Date: September 2025
"""
import json
import re
import time
from typing import Dict, List, Optional
from urllib.parse import urljoin, urlparse
import requests
from bs4 import BeautifulSoup
class ScikitLearnScraper:
    """
    A web scraper for extracting content from Scikit-learn documentation.

    This class handles the extraction of text content from the Scikit-learn
    User Guide pages, with proper error handling and content cleaning.
    """

    def __init__(self, base_url: str = "https://scikit-learn.org/stable/user_guide.html"):
        """
        Initialize the scraper with the base URL.

        Args:
            base_url (str): The main User Guide URL to start scraping from
        """
        self.base_url = base_url
        self.base_domain = "https://scikit-learn.org"
        # One Session is reused for every request: connection pooling + shared headers.
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Scikit-learn Documentation Scraper)'
        })

    def get_page_content(self, url: str, timeout: int = 10) -> Optional["BeautifulSoup"]:
        """
        Fetch and parse a web page.

        Args:
            url (str): URL to fetch
            timeout (int): Request timeout in seconds

        Returns:
            BeautifulSoup: Parsed HTML content, or None if fetching/parsing failed
        """
        try:
            print(f"Fetching: {url}")
            response = self.session.get(url, timeout=timeout)
            # Turn HTTP 4xx/5xx responses into RequestException.
            response.raise_for_status()
            return BeautifulSoup(response.content, 'html.parser')
        except requests.exceptions.RequestException as e:
            print(f"Error fetching {url}: {e}")
            return None
        except Exception as e:
            # Parsing errors are logged but never abort the whole crawl.
            print(f"Unexpected error parsing {url}: {e}")
            return None

    def extract_user_guide_links(self, soup: "BeautifulSoup") -> List[str]:
        """
        Extract all user guide links from the main page.

        Args:
            soup (BeautifulSoup): Parsed HTML of the main user guide page

        Returns:
            List[str]: Sorted, de-duplicated absolute URLs to user guide sections
        """
        links = []
        # Path prefixes that identify user-guide section pages.
        guide_prefixes = (
            'modules/',
            'supervised_learning',
            'unsupervised_learning',
            'model_selection',
            'data_transforms',
            'datasets/',
            'computing/',
            'model_persistence',
        )
        # Look for the main content area and find all links.
        # Scikit-learn's user guide keeps its links in the main content area.
        main_content = soup.find('div', class_='body') or soup.find('main') or soup
        if main_content:
            for link in main_content.find_all('a', href=True):
                href = link.get('href')
                # str.startswith accepts a tuple — one call instead of a long `or` chain.
                if href and (href.startswith(guide_prefixes) or 'user_guide' in href):
                    # Convert to an absolute URL under the /stable/ tree.
                    absolute_url = urljoin(self.base_domain + '/stable/', href)
                    # Keep HTML pages, or extensionless (directory-style) URLs.
                    if absolute_url.endswith('.html') or '.' not in absolute_url.split('/')[-1]:
                        links.append(absolute_url)
        # Also look for table-of-contents and navigation menus.
        toc_sections = soup.find_all(['div', 'nav'], class_=['toctree-wrapper', 'navigation', 'sidebar'])
        for section in toc_sections:
            for link in section.find_all('a', href=True):
                href = link.get('href')
                # Skip in-page anchors and already-absolute external links.
                if href and not href.startswith('#') and not href.startswith('http'):
                    absolute_url = urljoin(self.base_domain + '/stable/', href)
                    if absolute_url.endswith('.html') and 'user_guide' in absolute_url:
                        links.append(absolute_url)
        # De-duplicate and sort for a deterministic crawl order.
        unique_links = sorted(set(links))
        print(f"Found {len(unique_links)} user guide links")
        return unique_links

    def clean_text(self, text: str) -> str:
        """
        Clean and normalize extracted text content.

        Args:
            text (str): Raw text content

        Returns:
            str: Cleaned text content
        """
        # Collapse runs of 3+ newlines (with interleaved whitespace) into one blank line.
        text = re.sub(r'\n\s*\n\s*\n+', '\n\n', text)
        # Collapse runs of spaces/tabs into a single space.
        text = re.sub(r'[ \t]+', ' ', text)
        # Remove leading/trailing whitespace from each line.
        lines = [line.strip() for line in text.split('\n')]
        text = '\n'.join(lines)
        # Trim blank lines at start/end.
        return text.strip()

    def extract_main_content(self, soup: "BeautifulSoup") -> str:
        """
        Extract the main text content from a documentation page.

        Note: mutates `soup` in place (decomposes script/nav/sidebar elements).

        Args:
            soup (BeautifulSoup): Parsed HTML of the page

        Returns:
            str: Cleaned main text content
        """
        # Remove script/style and obvious page-chrome elements.
        for element in soup(['script', 'style', 'nav', 'header', 'footer']):
            element.decompose()
        # Try common documentation-site selectors for the main content area.
        main_content = None
        content_selectors = [
            'div.body',
            'div.document',
            'main',
            'div.content',
            'div.main-content',
            'article',
            'div.rst-content'
        ]
        for selector in content_selectors:
            main_content = soup.select_one(selector)
            if main_content:
                break
        # If no main content found, fall back to the whole body.
        if not main_content:
            main_content = soup.find('body') or soup
        # Remove sidebar and navigation elements.
        for element in main_content.find_all(['aside', 'nav']):
            element.decompose()
        # Remove elements whose classes mark navigation/sidebar chrome.
        remove_classes = [
            'sidebar', 'navigation', 'toctree', 'breadcrumb',
            'headerlink', 'viewcode-link', 'edit-on-github'
        ]
        for class_name in remove_classes:
            for element in main_content.find_all(class_=class_name):
                element.decompose()
        # Extract text content, one line per element.
        text = main_content.get_text(separator='\n', strip=True)
        return self.clean_text(text)

    def scrape_page(self, url: str, min_length: int = 100) -> Optional[Dict[str, str]]:
        """
        Scrape a single page and return its content.

        Args:
            url (str): URL to scrape
            min_length (int): Minimum cleaned-text length; shorter pages are skipped

        Returns:
            Dict[str, str]: Dictionary with 'url' and 'text' keys, or None if failed
        """
        print(f"Scraping page: {url}")
        soup = self.get_page_content(url)
        if not soup:
            return None
        text_content = self.extract_main_content(soup)
        # Pages with almost no text are usually redirects or stubs.
        if len(text_content.strip()) < min_length:
            print(f"Skipping page with minimal content: {url}")
            return None
        return {
            "url": url,
            "text": text_content
        }

    def scrape_all(self, delay: float = 1.0) -> List[Dict[str, str]]:
        """
        Scrape all user guide pages and return the content.

        Args:
            delay (float): Delay between requests in seconds (politeness throttle)

        Returns:
            List[Dict[str, str]]: List of dictionaries with scraped content
        """
        print("Starting Scikit-learn documentation scraping...")
        # Get the main user guide page first; without it there is nothing to crawl.
        main_soup = self.get_page_content(self.base_url)
        if not main_soup:
            print("Failed to fetch main user guide page")
            return []
        # Extract all user guide links and include the main page itself.
        links = self.extract_user_guide_links(main_soup)
        all_links = [self.base_url] + links
        scraped_content = []
        for i, url in enumerate(all_links, 1):
            try:
                content = self.scrape_page(url)
                if content:
                    scraped_content.append(content)
                    print(f"Successfully scraped {i}/{len(all_links)}: {url}")
                else:
                    print(f"Failed to scrape {i}/{len(all_links)}: {url}")
                # Throttle between requests to be respectful to the server.
                if i < len(all_links):
                    time.sleep(delay)
            except KeyboardInterrupt:
                # Ctrl-C stops cleanly, keeping whatever was scraped so far.
                print("\nScraping interrupted by user")
                break
            except Exception as e:
                print(f"Unexpected error scraping {url}: {e}")
                continue
        print(f"\nScraping completed! Total pages scraped: {len(scraped_content)}")
        return scraped_content

    def save_to_json(self, content: List[Dict[str, str]], filename: str = "scraped_content.json"):
        """
        Save scraped content to a JSON file.

        Args:
            content (List[Dict[str, str]]): Scraped content
            filename (str): Output filename
        """
        try:
            with open(filename, 'w', encoding='utf-8') as f:
                json.dump(content, f, indent=2, ensure_ascii=False)
            # Fixed: the message previously printed the literal "(unknown)"
            # instead of interpolating the filename.
            print(f"Content saved to {filename}")
        except Exception as e:
            print(f"Error saving to {filename}: {e}")
def main():
    """
    Entry point: scrape the Scikit-learn user guide and persist it as JSON.
    """
    print("Scikit-learn Documentation Scraper")
    print("=" * 50)

    # Build the scraper and crawl every user-guide page (1s between requests).
    scraper = ScikitLearnScraper()
    content = scraper.scrape_all(delay=1.0)

    # Guard clause: nothing scraped means nothing to save or summarize.
    if not content:
        print("No content was scraped successfully.")
        return

    # Persist the results, then report simple corpus statistics.
    scraper.save_to_json(content)
    total_chars = sum(len(item['text']) for item in content)
    print(f"\nSummary:")
    print(f"- Pages scraped: {len(content)}")
    print(f"- Total characters: {total_chars:,}")
    print(f"- Average characters per page: {total_chars // len(content):,}")
# Run the scraper only when executed as a script, so importing this
# module (e.g. from the RAG app) does not trigger a network crawl.
if __name__ == "__main__":
    main()