#!/usr/bin/env python3
"""
Text Chunker for Scikit-learn Documentation

This module processes the scraped Scikit-learn documentation and chunks
it into smaller, manageable pieces for use in a RAG application.

Author: AI Assistant
Date: September 2025
"""

import json
import logging
import sys
from pathlib import Path
from typing import Any, Dict, List, Optional

from langchain_text_splitters import RecursiveCharacterTextSplitter

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Documents shorter than this (after stripping) are skipped entirely.
MIN_DOCUMENT_CHARS = 100
# Chunks shorter than this (after stripping) are discarded as noise.
MIN_CHUNK_CHARS = 50


class DocumentChunker:
    """
    A class for chunking scraped documentation into smaller pieces.

    This class handles the process of splitting long documents into
    manageable chunks while preserving metadata and context.
    """

    def __init__(
        self,
        chunk_size: int = 1000,
        chunk_overlap: int = 150,
        separators: Optional[List[str]] = None
    ):
        """
        Initialize the DocumentChunker.

        Args:
            chunk_size (int): Target size for each chunk in characters
            chunk_overlap (int): Number of characters to overlap between chunks
            separators (Optional[List[str]]): Custom separators for text
                splitting; when None, defaults tuned for prose documentation
                are used (paragraphs first, then sentences, then words).
        """
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

        # Use custom separators or defaults optimized for documentation.
        # Order matters: the splitter tries each separator in turn, so we
        # prefer the largest semantic boundaries first.
        if separators is None:
            separators = [
                "\n\n",  # Double newlines (paragraphs)
                "\n",    # Single newlines
                ". ",    # Sentences
                "! ",    # Exclamations
                "? ",    # Questions
                "; ",    # Semicolons
                ", ",    # Commas
                " ",     # Spaces
                ""       # Characters (last resort)
            ]

        # Initialize the text splitter
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            separators=separators,
            length_function=len,
        )

        logger.info(
            "Initialized chunker with size=%d, overlap=%d",
            chunk_size, chunk_overlap
        )

    def load_scraped_content(self, filepath: str) -> List[Dict[str, str]]:
        """
        Load scraped content from JSON file.

        Args:
            filepath (str): Path to the scraped content JSON file

        Returns:
            List[Dict[str, str]]: List of documents with 'url' and 'text' keys

        Raises:
            FileNotFoundError: If the file doesn't exist
            json.JSONDecodeError: If the file is not valid JSON
        """
        path = Path(filepath)

        if not path.exists():
            raise FileNotFoundError(f"Scraped content file not found: {path}")

        logger.info("Loading scraped content from %s", path)

        try:
            with open(path, 'r', encoding='utf-8') as f:
                content = json.load(f)
            logger.info("Loaded %d documents from %s", len(content), path)
            return content
        except json.JSONDecodeError as e:
            logger.error("Invalid JSON in %s: %s", path, e)
            raise
        except Exception as e:
            logger.error("Error loading %s: %s", path, e)
            raise

    def create_chunk_metadata(
        self, original_url: str, chunk_index: int
    ) -> Dict[str, Any]:
        """
        Create metadata for a chunk.

        Args:
            original_url (str): URL of the original document
            chunk_index (int): Index of this chunk within the document

        Returns:
            Dict[str, Any]: Metadata dictionary with 'url', 'chunk_index'
                and a constant 'source' tag.
        """
        return {
            "url": original_url,
            "chunk_index": chunk_index,
            "source": "scikit-learn-docs"
        }

    def chunk_document(self, document: Dict[str, str]) -> List[Dict[str, Any]]:
        """
        Chunk a single document into smaller pieces.

        Args:
            document (Dict[str, str]): Document with 'url' and 'text' keys

        Returns:
            List[Dict[str, Any]]: List of chunks with 'page_content' and
                'metadata' keys. Empty if the document has minimal content.
        """
        url = document['url']
        text = document['text']

        # Skip documents with minimal content — too short to yield a
        # useful chunk.
        if len(text.strip()) < MIN_DOCUMENT_CHARS:
            logger.warning("Skipping document with minimal content: %s", url)
            return []

        logger.info("Chunking document: %s (%d characters)", url, len(text))

        # Split the text into chunks
        text_chunks = self.text_splitter.split_text(text)

        # Create chunk objects with metadata. The index `i` is the chunk's
        # position in the splitter output, so indices stay stable even when
        # tiny chunks are dropped.
        chunks = []
        for i, chunk_text in enumerate(text_chunks):
            # Skip very small chunks — likely separator residue.
            if len(chunk_text.strip()) < MIN_CHUNK_CHARS:
                continue
            chunks.append({
                "page_content": chunk_text.strip(),
                "metadata": self.create_chunk_metadata(url, i)
            })

        logger.info("Created %d chunks from %s", len(chunks), url)
        return chunks

    def chunk_all_documents(
        self, documents: List[Dict[str, str]]
    ) -> List[Dict[str, Any]]:
        """
        Chunk all documents in the collection.

        Documents that fail to chunk are logged and skipped rather than
        aborting the whole run.

        Args:
            documents (List[Dict[str, str]]): List of documents to chunk

        Returns:
            List[Dict[str, Any]]: List of all chunks from all documents
        """
        logger.info("Starting to chunk %d documents", len(documents))

        all_chunks: List[Dict[str, Any]] = []
        total_chars_processed = 0

        for doc_index, document in enumerate(documents, 1):
            try:
                doc_chunks = self.chunk_document(document)
                all_chunks.extend(doc_chunks)

                # Track progress. .get() guards against malformed records
                # missing a 'text' key (chunk_document would have raised
                # before reaching here, but keep the accounting safe).
                total_chars_processed += len(document.get('text', ''))

                if doc_index % 10 == 0:
                    logger.info(
                        "Processed %d/%d documents", doc_index, len(documents)
                    )
            except Exception as e:
                logger.error(
                    "Error chunking document %s: %s",
                    document.get('url', 'unknown'), e
                )
                continue

        logger.info("Chunking completed:")
        logger.info(" - Documents processed: %d", len(documents))
        logger.info(" - Total chunks created: %d", len(all_chunks))
        logger.info(" - Total characters processed: %s",
                    f"{total_chars_processed:,}")
        # Guard against division by zero on an empty input collection.
        if documents:
            logger.info(" - Average chunks per document: %.1f",
                        len(all_chunks) / len(documents))

        return all_chunks

    def save_chunks(self, chunks: List[Dict[str, Any]], filepath: str):
        """
        Save chunks to a JSON file.

        Args:
            chunks (List[Dict[str, Any]]): List of chunks to save
            filepath (str): Output file path

        Raises:
            OSError: If the file cannot be written.
        """
        path = Path(filepath)
        logger.info("Saving %d chunks to %s", len(chunks), path)

        try:
            with open(path, 'w', encoding='utf-8') as f:
                # ensure_ascii=False keeps non-ASCII documentation text
                # readable in the output file.
                json.dump(chunks, f, indent=2, ensure_ascii=False)

            # Calculate file size for the log line (MB).
            file_size = path.stat().st_size / (1024 * 1024)
            logger.info("Chunks saved successfully (%.2f MB)", file_size)
        except Exception as e:
            logger.error("Error saving chunks to %s: %s", path, e)
            raise

    def get_chunk_statistics(
        self, chunks: List[Dict[str, Any]]
    ) -> Dict[str, Any]:
        """
        Calculate statistics about the chunks.

        Args:
            chunks (List[Dict[str, Any]]): List of chunks

        Returns:
            Dict[str, Any]: Statistics dictionary with counts and length
                aggregates; empty dict when `chunks` is empty.
        """
        if not chunks:
            return {}

        chunk_lengths = [len(chunk['page_content']) for chunk in chunks]
        unique_urls = {chunk['metadata']['url'] for chunk in chunks}

        return {
            "total_chunks": len(chunks),
            "unique_documents": len(unique_urls),
            "avg_chunk_length": sum(chunk_lengths) / len(chunk_lengths),
            "min_chunk_length": min(chunk_lengths),
            "max_chunk_length": max(chunk_lengths),
            "total_characters": sum(chunk_lengths)
        }

    def process_and_save(
        self,
        input_filepath: str,
        output_filepath: str = "chunks.json"
    ) -> Dict[str, Any]:
        """
        Complete processing pipeline: load, chunk, and save.

        Args:
            input_filepath (str): Path to scraped content JSON
            output_filepath (str): Path to save chunks JSON

        Returns:
            Dict[str, Any]: Processing statistics (see get_chunk_statistics)
        """
        logger.info("Starting document chunking pipeline")

        # Load scraped content
        documents = self.load_scraped_content(input_filepath)

        # Chunk all documents
        chunks = self.chunk_all_documents(documents)

        # Save chunks
        self.save_chunks(chunks, output_filepath)

        # Calculate and return statistics
        stats = self.get_chunk_statistics(chunks)

        logger.info("Chunking pipeline completed successfully")
        return stats


def main() -> int:
    """
    Main function to run the chunking process.

    Returns:
        int: Process exit code (0 on success, 1 on failure).
    """
    print("Scikit-learn Documentation Chunker")
    print("=" * 50)

    # Configuration
    input_file = "scraped_content.json"
    output_file = "chunks.json"

    # Initialize chunker with optimal settings for documentation
    chunker = DocumentChunker(
        chunk_size=1000,
        chunk_overlap=150
    )

    try:
        # Process documents
        stats = chunker.process_and_save(input_file, output_file)

        # Display results. NOTE: stats is empty when no chunks were
        # produced, in which case the key lookups below raise and the
        # run reports failure — acceptable for a CLI tool.
        print(f"\nProcessing Results:")
        print(f"   šŸ“„ Total chunks created: {stats['total_chunks']:,}")
        print(f"   šŸ“š Unique documents: {stats['unique_documents']}")
        print(f"   šŸ“ Average chunk length: {stats['avg_chunk_length']:.0f} characters")
        print(f"   šŸ“Š Min/Max chunk length: {stats['min_chunk_length']}/{stats['max_chunk_length']}")
        print(f"   šŸ’¾ Total characters: {stats['total_characters']:,}")
        print(f"   āœ… Chunks saved to: {output_file}")
    except Exception as e:
        logger.error("Pipeline failed: %s", e)
        print(f"\nāŒ Error: {e}")
        return 1

    return 0


if __name__ == "__main__":
    sys.exit(main())