#!/usr/bin/env python3
"""
Text Chunker for Scikit-learn Documentation

This module processes the scraped Scikit-learn documentation and chunks it
into smaller, manageable pieces for use in a RAG application.

Author: AI Assistant
Date: September 2025
"""

import json
import logging
import sys
from pathlib import Path
from typing import Any, Dict, List, Optional

from langchain_text_splitters import RecursiveCharacterTextSplitter


# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)


class DocumentChunker:
    """
    A class for chunking scraped documentation into smaller pieces.
    
    This class handles the process of splitting long documents into
    manageable chunks while preserving metadata and context.
    """
    
    def __init__(
        self,
        chunk_size: int = 1000,
        chunk_overlap: int = 150,
        separators: Optional[List[str]] = None
    ):
        """
        Initialize the DocumentChunker.
        
        Args:
            chunk_size (int): Target size for each chunk in characters
            chunk_overlap (int): Number of characters to overlap between chunks
            separators (List[str]): Custom separators for text splitting
        """
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
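        # The overlap repeats the tail of one chunk at the head of the next,
        # so sentences that straddle a boundary remain retrievable from both.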
        
        # Use custom separators or defaults optimized for documentation
        if separators is None:
            separators = [
                "\n\n",      # Double newlines (paragraphs)
                "\n",        # Single newlines
                ". ",        # Sentences
                "! ",        # Exclamations
                "? ",        # Questions
                "; ",        # Semicolons
                ", ",        # Commas
                " ",         # Spaces
                ""           # Characters (last resort)
            ]
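            # The splitter tries these separators in order, falling through to
            # the next (finer) one only for pieces still longer than chunk_size,
            # so breaks land on paragraph boundaries whenever possible.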
        
        # Initialize the text splitter
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            separators=separators,
            length_function=len,
        )
        
        logger.info(f"Initialized chunker with size={chunk_size}, overlap={chunk_overlap}")
    
    def load_scraped_content(self, filepath: str) -> List[Dict[str, str]]:
        """
        Load scraped content from JSON file.
        
        Args:
            filepath (str): Path to the scraped content JSON file
            
        Returns:
            List[Dict[str, str]]: List of documents with 'url' and 'text' keys
            
        Raises:
            FileNotFoundError: If the file doesn't exist
            json.JSONDecodeError: If the file is not valid JSON
        """
        filepath = Path(filepath)
        
        if not filepath.exists():
            raise FileNotFoundError(f"Scraped content file not found: {filepath}")
        
        logger.info(f"Loading scraped content from {filepath}")
        
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                content = json.load(f)
            
            logger.info(f"Loaded {len(content)} documents from {filepath}")
            return content
            
        except json.JSONDecodeError as e:
            logger.error(f"Invalid JSON in {filepath}: {e}")
            raise
        except Exception as e:
            logger.error(f"Error loading {filepath}: {e}")
            raise
    
    def create_chunk_metadata(self, original_url: str, chunk_index: int) -> Dict[str, Any]:
        """
        Create metadata for a chunk.
        
        Args:
            original_url (str): URL of the original document
            chunk_index (int): Index of this chunk within the document
            
        Returns:
            Dict[str, Any]: Metadata dictionary
        """
        return {
            "url": original_url,
            "chunk_index": chunk_index,
            "source": "scikit-learn-docs"
        }
    
    def chunk_document(self, document: Dict[str, str]) -> List[Dict[str, Any]]:
        """
        Chunk a single document into smaller pieces.
        
        Args:
            document (Dict[str, str]): Document with 'url' and 'text' keys
            
        Returns:
            List[Dict[str, Any]]: List of chunks with 'page_content' and 'metadata' keys
        """
        url = document['url']
        text = document['text']
        
        # Skip documents with minimal content
        if len(text.strip()) < 100:
            logger.warning(f"Skipping document with minimal content: {url}")
            return []
        
        logger.info(f"Chunking document: {url} ({len(text)} characters)")
        
        # Split the text into chunks
        text_chunks = self.text_splitter.split_text(text)
        
        # Create chunk objects with metadata
        chunks = []
        for i, chunk_text in enumerate(text_chunks):
            # Skip very small chunks
            if len(chunk_text.strip()) < 50:
                continue
                
            chunk = {
                "page_content": chunk_text.strip(),
                "metadata": self.create_chunk_metadata(url, i)
            }
            chunks.append(chunk)
        
        logger.info(f"Created {len(chunks)} chunks from {url}")
        return chunks
    
    def chunk_all_documents(self, documents: List[Dict[str, str]]) -> List[Dict[str, Any]]:
        """
        Chunk all documents in the collection.
        
        Args:
            documents (List[Dict[str, str]]): List of documents to chunk
            
        Returns:
            List[Dict[str, Any]]: List of all chunks from all documents
        """
        logger.info(f"Starting to chunk {len(documents)} documents")
        
        all_chunks = []
        total_chars_processed = 0
        
        for doc_index, document in enumerate(documents, 1):
            try:
                doc_chunks = self.chunk_document(document)
                all_chunks.extend(doc_chunks)
                
                # Track progress
                total_chars_processed += len(document['text'])
                
                if doc_index % 10 == 0:
                    logger.info(f"Processed {doc_index}/{len(documents)} documents")
                    
            except Exception as e:
                logger.error(f"Error chunking document {document.get('url', 'unknown')}: {e}")
                continue
        
        logger.info(f"Chunking completed:")
        logger.info(f"  - Documents processed: {len(documents)}")
        logger.info(f"  - Total chunks created: {len(all_chunks)}")
        logger.info(f"  - Total characters processed: {total_chars_processed:,}")
        logger.info(f"  - Average chunks per document: {len(all_chunks) / len(documents):.1f}")
        
        return all_chunks
    
    def save_chunks(self, chunks: List[Dict[str, Any]], filepath: str):
        """
        Save chunks to a JSON file.
        
        Args:
            chunks (List[Dict[str, Any]]): List of chunks to save
            filepath (str): Output file path
        """
        filepath = Path(filepath)
        
        logger.info(f"Saving {len(chunks)} chunks to {filepath}")
        
        try:
            with open(filepath, 'w', encoding='utf-8') as f:
                json.dump(chunks, f, indent=2, ensure_ascii=False)
            
            # Calculate file size
            file_size = filepath.stat().st_size / (1024 * 1024)  # MB
            logger.info(f"Chunks saved successfully ({file_size:.2f} MB)")
            
        except Exception as e:
            logger.error(f"Error saving chunks to {filepath}: {e}")
            raise
    
    def get_chunk_statistics(self, chunks: List[Dict[str, Any]]) -> Dict[str, Any]:
        """
        Calculate statistics about the chunks.
        
        Args:
            chunks (List[Dict[str, Any]]): List of chunks
            
        Returns:
            Dict[str, Any]: Statistics dictionary
        """
        if not chunks:
            return {}
        
        chunk_lengths = [len(chunk['page_content']) for chunk in chunks]
        unique_urls = set(chunk['metadata']['url'] for chunk in chunks)
        
        stats = {
            "total_chunks": len(chunks),
            "unique_documents": len(unique_urls),
            "avg_chunk_length": sum(chunk_lengths) / len(chunk_lengths),
            "min_chunk_length": min(chunk_lengths),
            "max_chunk_length": max(chunk_lengths),
            "total_characters": sum(chunk_lengths)
        }
        
        return stats
    
    def process_and_save(
        self,
        input_filepath: str,
        output_filepath: str = "chunks.json"
    ) -> Dict[str, Any]:
        """
        Complete processing pipeline: load, chunk, and save.
        
        Args:
            input_filepath (str): Path to scraped content JSON
            output_filepath (str): Path to save chunks JSON
            
        Returns:
            Dict[str, Any]: Processing statistics
        """
        logger.info("Starting document chunking pipeline")
        
        # Load scraped content
        documents = self.load_scraped_content(input_filepath)
        
        # Chunk all documents
        chunks = self.chunk_all_documents(documents)
        
        # Save chunks
        self.save_chunks(chunks, output_filepath)
        
        # Calculate and return statistics
        stats = self.get_chunk_statistics(chunks)
        
        logger.info("Chunking pipeline completed successfully")
        return stats
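
# Illustrative library usage (a minimal sketch; the file names match the
# defaults used in main() below, and the input must be a JSON list of
# {"url": ..., "text": ...} objects as produced by the scraper):
#
#     chunker = DocumentChunker(chunk_size=800, chunk_overlap=100)
#     stats = chunker.process_and_save("scraped_content.json", "chunks.json")
#     print(f"Created {stats['total_chunks']} chunks")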


def main() -> int:
    """
    Main function to run the chunking process.
    """
    print("Scikit-learn Documentation Chunker")
    print("=" * 50)
    
    # Configuration
    input_file = "scraped_content.json"
    output_file = "chunks.json"
    
    # Initialize chunker with optimal settings for documentation
    chunker = DocumentChunker(
        chunk_size=1000,
        chunk_overlap=150
    )
    
    try:
        # Process documents
        stats = chunker.process_and_save(input_file, output_file)
        
        # Display results
        print(f"\nProcessing Results:")
        print(f"  πŸ“„ Total chunks created: {stats['total_chunks']:,}")
        print(f"  πŸ“š Unique documents: {stats['unique_documents']}")
        print(f"  πŸ“ Average chunk length: {stats['avg_chunk_length']:.0f} characters")
        print(f"  πŸ“Š Min/Max chunk length: {stats['min_chunk_length']}/{stats['max_chunk_length']}")
        print(f"  πŸ’Ύ Total characters: {stats['total_characters']:,}")
        print(f"  βœ… Chunks saved to: {output_file}")
        
    except Exception as e:
        logger.error(f"Pipeline failed: {e}")
        print(f"\n❌ Error: {e}")
        return 1
    
    return 0


if __name__ == "__main__":
    sys.exit(main())