# scikit-rag / chunker.py
# (Hugging Face file-view header preserved as comments: fguryel's picture / init / commit 9222df3)
#!/usr/bin/env python3
"""
Text Chunker for Scikit-learn Documentation
This module processes the scraped Scikit-learn documentation and chunks it
into smaller, manageable pieces for use in a RAG application.
Author: AI Assistant
Date: September 2025
"""
import json
import logging
import sys
from pathlib import Path
from typing import Any, Dict, List, Optional

from langchain_text_splitters import RecursiveCharacterTextSplitter
# Configure logging once at import time for the whole module.
# NOTE: basicConfig is a no-op if the root logger is already configured,
# so embedding applications keep control of their own logging setup.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
# Module-level logger; other names in this file (and importers) rely on it.
logger = logging.getLogger(__name__)
class DocumentChunker:
    """
    A class for chunking scraped documentation into smaller pieces.

    This class handles the process of splitting long documents into
    manageable chunks while preserving metadata and context: documents are
    loaded from JSON, split with RecursiveCharacterTextSplitter, tagged with
    per-chunk metadata, and written back to JSON.
    """

    # Documents whose raw text is shorter than this (after stripping) are
    # skipped entirely — they carry too little content to be useful chunks.
    MIN_DOCUMENT_CHARS = 100
    # Individual chunks shorter than this (after stripping) are discarded.
    MIN_CHUNK_CHARS = 50

    def __init__(
        self,
        chunk_size: int = 1000,
        chunk_overlap: int = 150,
        separators: Optional[List[str]] = None
    ):
        """
        Initialize the DocumentChunker.

        Args:
            chunk_size (int): Target size for each chunk in characters
            chunk_overlap (int): Number of characters to overlap between chunks
            separators (Optional[List[str]]): Custom separators for text
                splitting; when None, defaults ordered from coarse
                (paragraphs) to fine (single characters) are used
        """
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

        # Use custom separators or defaults optimized for documentation.
        # The splitter tries separators in order, so coarse boundaries
        # (paragraphs, sentences) are preferred over character-level splits.
        if separators is None:
            separators = [
                "\n\n",  # Double newlines (paragraphs)
                "\n",    # Single newlines
                ". ",    # Sentences
                "! ",    # Exclamations
                "? ",    # Questions
                "; ",    # Semicolons
                ", ",    # Commas
                " ",     # Spaces
                ""       # Characters (last resort)
            ]

        # Initialize the text splitter
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            separators=separators,
            length_function=len,
        )

        logger.info(f"Initialized chunker with size={chunk_size}, overlap={chunk_overlap}")

    def load_scraped_content(self, filepath: str) -> List[Dict[str, str]]:
        """
        Load scraped content from JSON file.

        Args:
            filepath (str): Path to the scraped content JSON file

        Returns:
            List[Dict[str, str]]: List of documents with 'url' and 'text' keys

        Raises:
            FileNotFoundError: If the file doesn't exist
            json.JSONDecodeError: If the file is not valid JSON
        """
        filepath = Path(filepath)
        if not filepath.exists():
            raise FileNotFoundError(f"Scraped content file not found: {filepath}")

        logger.info(f"Loading scraped content from {filepath}")
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                content = json.load(f)
            logger.info(f"Loaded {len(content)} documents from {filepath}")
            return content
        except json.JSONDecodeError as e:
            logger.error(f"Invalid JSON in {filepath}: {e}")
            raise
        except Exception as e:
            logger.error(f"Error loading {filepath}: {e}")
            raise

    def create_chunk_metadata(self, original_url: str, chunk_index: int) -> Dict[str, Any]:
        """
        Create metadata for a chunk.

        Args:
            original_url (str): URL of the original document
            chunk_index (int): Index of this chunk within the document

        Returns:
            Dict[str, Any]: Metadata dictionary with 'url', 'chunk_index'
                and a fixed 'source' tag identifying the corpus
        """
        return {
            "url": original_url,
            "chunk_index": chunk_index,
            "source": "scikit-learn-docs"
        }

    def chunk_document(self, document: Dict[str, str]) -> List[Dict[str, Any]]:
        """
        Chunk a single document into smaller pieces.

        Args:
            document (Dict[str, str]): Document with 'url' and 'text' keys

        Returns:
            List[Dict[str, Any]]: List of chunks with 'page_content' and
                'metadata' keys; empty if the document is too short
        """
        url = document['url']
        text = document['text']

        # Skip documents with minimal content
        if len(text.strip()) < self.MIN_DOCUMENT_CHARS:
            logger.warning(f"Skipping document with minimal content: {url}")
            return []

        logger.info(f"Chunking document: {url} ({len(text)} characters)")

        # Split the text into chunks
        text_chunks = self.text_splitter.split_text(text)

        # Create chunk objects with metadata; the chunk index reflects the
        # splitter's ordering even when tiny chunks are skipped.
        chunks = []
        for i, chunk_text in enumerate(text_chunks):
            # Skip very small chunks
            if len(chunk_text.strip()) < self.MIN_CHUNK_CHARS:
                continue
            chunk = {
                "page_content": chunk_text.strip(),
                "metadata": self.create_chunk_metadata(url, i)
            }
            chunks.append(chunk)

        logger.info(f"Created {len(chunks)} chunks from {url}")
        return chunks

    def chunk_all_documents(self, documents: List[Dict[str, str]]) -> List[Dict[str, Any]]:
        """
        Chunk all documents in the collection.

        Per-document failures are logged and skipped so one bad document
        does not abort the whole run.

        Args:
            documents (List[Dict[str, str]]): List of documents to chunk

        Returns:
            List[Dict[str, Any]]: List of all chunks from all documents
        """
        logger.info(f"Starting to chunk {len(documents)} documents")

        all_chunks = []
        total_chars_processed = 0
        for doc_index, document in enumerate(documents, 1):
            try:
                doc_chunks = self.chunk_document(document)
                all_chunks.extend(doc_chunks)
                # Track progress
                total_chars_processed += len(document['text'])
                if doc_index % 10 == 0:
                    logger.info(f"Processed {doc_index}/{len(documents)} documents")
            except Exception as e:
                logger.error(f"Error chunking document {document.get('url', 'unknown')}: {e}")
                continue

        logger.info("Chunking completed:")
        logger.info(f" - Documents processed: {len(documents)}")
        logger.info(f" - Total chunks created: {len(all_chunks)}")
        logger.info(f" - Total characters processed: {total_chars_processed:,}")
        # Guard against ZeroDivisionError when the input list is empty.
        if documents:
            logger.info(f" - Average chunks per document: {len(all_chunks) / len(documents):.1f}")
        return all_chunks

    def save_chunks(self, chunks: List[Dict[str, Any]], filepath: str) -> None:
        """
        Save chunks to a JSON file.

        Args:
            chunks (List[Dict[str, Any]]): List of chunks to save
            filepath (str): Output file path

        Raises:
            OSError: If the file cannot be written
        """
        filepath = Path(filepath)
        logger.info(f"Saving {len(chunks)} chunks to {filepath}")
        try:
            # ensure_ascii=False keeps non-ASCII documentation text readable.
            with open(filepath, 'w', encoding='utf-8') as f:
                json.dump(chunks, f, indent=2, ensure_ascii=False)
            # Calculate file size
            file_size = filepath.stat().st_size / (1024 * 1024)  # MB
            logger.info(f"Chunks saved successfully ({file_size:.2f} MB)")
        except Exception as e:
            logger.error(f"Error saving chunks to {filepath}: {e}")
            raise

    def get_chunk_statistics(self, chunks: List[Dict[str, Any]]) -> Dict[str, Any]:
        """
        Calculate statistics about the chunks.

        Args:
            chunks (List[Dict[str, Any]]): List of chunks

        Returns:
            Dict[str, Any]: Statistics dictionary; empty dict when there
                are no chunks (callers must handle this case)
        """
        if not chunks:
            return {}

        chunk_lengths = [len(chunk['page_content']) for chunk in chunks]
        unique_urls = set(chunk['metadata']['url'] for chunk in chunks)
        stats = {
            "total_chunks": len(chunks),
            "unique_documents": len(unique_urls),
            "avg_chunk_length": sum(chunk_lengths) / len(chunk_lengths),
            "min_chunk_length": min(chunk_lengths),
            "max_chunk_length": max(chunk_lengths),
            "total_characters": sum(chunk_lengths)
        }
        return stats

    def process_and_save(
        self,
        input_filepath: str,
        output_filepath: str = "chunks.json"
    ) -> Dict[str, Any]:
        """
        Complete processing pipeline: load, chunk, and save.

        Args:
            input_filepath (str): Path to scraped content JSON
            output_filepath (str): Path to save chunks JSON

        Returns:
            Dict[str, Any]: Processing statistics (empty if no chunks
                were produced)
        """
        logger.info("Starting document chunking pipeline")

        # Load scraped content
        documents = self.load_scraped_content(input_filepath)
        # Chunk all documents
        chunks = self.chunk_all_documents(documents)
        # Save chunks
        self.save_chunks(chunks, output_filepath)
        # Calculate and return statistics
        stats = self.get_chunk_statistics(chunks)

        logger.info("Chunking pipeline completed successfully")
        return stats
def main() -> int:
    """
    Run the full chunking pipeline as a command-line entry point.

    Loads "scraped_content.json", chunks it, writes "chunks.json", and
    prints a summary of the results.

    Returns:
        int: Exit code — 0 on success, 1 on failure.
    """
    print("Scikit-learn Documentation Chunker")
    print("=" * 50)

    # Configuration
    input_file = "scraped_content.json"
    output_file = "chunks.json"

    # Initialize chunker with optimal settings for documentation
    chunker = DocumentChunker(
        chunk_size=1000,
        chunk_overlap=150
    )

    try:
        # Process documents
        stats = chunker.process_and_save(input_file, output_file)
    except Exception as e:
        logger.error(f"Pipeline failed: {e}")
        print(f"\n❌ Error: {e}")
        return 1

    # get_chunk_statistics() returns an empty dict when no chunks were
    # produced; fall back to zeros rather than raising KeyError (which the
    # original code would have misreported as a pipeline failure).
    print("\nProcessing Results:")
    print(f" 📄 Total chunks created: {stats.get('total_chunks', 0):,}")
    print(f" 📚 Unique documents: {stats.get('unique_documents', 0)}")
    print(f" 📏 Average chunk length: {stats.get('avg_chunk_length', 0):.0f} characters")
    print(f" 📊 Min/Max chunk length: {stats.get('min_chunk_length', 0)}/{stats.get('max_chunk_length', 0)}")
    print(f" 💾 Total characters: {stats.get('total_characters', 0):,}")
    print(f" ✅ Chunks saved to: {output_file}")
    return 0
if __name__ == "__main__":
    # sys.exit propagates main()'s return value as the process exit code;
    # the builtin exit() is a site-module convenience and is not guaranteed
    # to exist in every interpreter invocation (e.g. python -S).
    sys.exit(main())