Spaces:
Sleeping
Sleeping
| #!/usr/bin/env python3 | |
| """ | |
| Text Chunker for Scikit-learn Documentation | |
| This module processes the scraped Scikit-learn documentation and chunks it | |
| into smaller, manageable pieces for use in a RAG application. | |
| Author: AI Assistant | |
| Date: September 2025 | |
| """ | |
import json
import logging
import sys
from pathlib import Path
from typing import Any, Dict, List, Optional

from langchain_text_splitters import RecursiveCharacterTextSplitter
# Configure module-wide logging: INFO level with timestamped messages so
# pipeline progress (loading, chunking, saving) is visible on the console.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
class DocumentChunker:
    """
    Chunk scraped documentation into smaller pieces for a RAG application.

    Wraps a ``RecursiveCharacterTextSplitter`` and adds helpers to load
    scraped JSON, chunk documents while preserving source metadata
    (URL, chunk index), save the results, and compute summary statistics.
    """

    def __init__(
        self,
        chunk_size: int = 1000,
        chunk_overlap: int = 150,
        separators: Optional[List[str]] = None
    ):
        """
        Initialize the DocumentChunker.

        Args:
            chunk_size (int): Target size for each chunk in characters.
            chunk_overlap (int): Number of characters to overlap between
                consecutive chunks.
            separators (Optional[List[str]]): Custom separators for text
                splitting. When None, defaults tuned for documentation
                are used.
        """
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

        # Default separators are ordered coarse-to-fine so the splitter
        # prefers natural boundaries before cutting mid-word.
        if separators is None:
            separators = [
                "\n\n",  # double newlines (paragraphs)
                "\n",    # single newlines
                ". ",    # sentences
                "! ",    # exclamations
                "? ",    # questions
                "; ",    # semicolons
                ", ",    # commas
                " ",     # spaces
                "",      # characters (last resort)
            ]
        self.separators = separators

        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            separators=separators,
            length_function=len,
        )
        logger.info(f"Initialized chunker with size={chunk_size}, overlap={chunk_overlap}")

    def load_scraped_content(self, filepath: str) -> List[Dict[str, str]]:
        """
        Load scraped content from a JSON file.

        Args:
            filepath (str): Path to the scraped content JSON file.

        Returns:
            List[Dict[str, str]]: Documents with 'url' and 'text' keys.

        Raises:
            FileNotFoundError: If the file doesn't exist.
            json.JSONDecodeError: If the file is not valid JSON.
        """
        path = Path(filepath)
        if not path.exists():
            raise FileNotFoundError(f"Scraped content file not found: {path}")

        logger.info(f"Loading scraped content from {path}")
        try:
            with open(path, 'r', encoding='utf-8') as f:
                content = json.load(f)
            logger.info(f"Loaded {len(content)} documents from {path}")
            return content
        except json.JSONDecodeError as e:
            logger.error(f"Invalid JSON in {path}: {e}")
            raise
        except Exception as e:
            logger.error(f"Error loading {path}: {e}")
            raise

    def create_chunk_metadata(self, original_url: str, chunk_index: int) -> Dict[str, Any]:
        """
        Create metadata for a chunk.

        Args:
            original_url (str): URL of the original document.
            chunk_index (int): Index of this chunk within the document.

        Returns:
            Dict[str, Any]: Metadata with 'url', 'chunk_index' and
            'source' keys.
        """
        return {
            "url": original_url,
            "chunk_index": chunk_index,
            "source": "scikit-learn-docs"
        }

    def chunk_document(self, document: Dict[str, str]) -> List[Dict[str, Any]]:
        """
        Chunk a single document into smaller pieces.

        Args:
            document (Dict[str, str]): Document with 'url' and 'text' keys.

        Returns:
            List[Dict[str, Any]]: Chunks with 'page_content' and
            'metadata' keys; empty when the document has minimal content.
        """
        url = document['url']
        text = document['text']

        # Documents under 100 characters carry no useful content to chunk.
        if len(text.strip()) < 100:
            logger.warning(f"Skipping document with minimal content: {url}")
            return []

        logger.info(f"Chunking document: {url} ({len(text)} characters)")
        text_chunks = self.text_splitter.split_text(text)

        chunks = []
        for i, chunk_text in enumerate(text_chunks):
            # Strip once; drop tiny fragments that carry too little context.
            stripped = chunk_text.strip()
            if len(stripped) < 50:
                continue
            chunks.append({
                "page_content": stripped,
                "metadata": self.create_chunk_metadata(url, i)
            })

        logger.info(f"Created {len(chunks)} chunks from {url}")
        return chunks

    def chunk_all_documents(self, documents: List[Dict[str, str]]) -> List[Dict[str, Any]]:
        """
        Chunk every document in the collection.

        Documents that fail to chunk are logged and skipped so one bad
        record does not abort the whole run.

        Args:
            documents (List[Dict[str, str]]): Documents to chunk.

        Returns:
            List[Dict[str, Any]]: All chunks from all documents.
        """
        logger.info(f"Starting to chunk {len(documents)} documents")

        all_chunks = []
        total_chars_processed = 0

        for doc_index, document in enumerate(documents, 1):
            try:
                all_chunks.extend(self.chunk_document(document))
                total_chars_processed += len(document['text'])
                if doc_index % 10 == 0:
                    logger.info(f"Processed {doc_index}/{len(documents)} documents")
            except Exception as e:
                logger.error(f"Error chunking document {document.get('url', 'unknown')}: {e}")
                continue

        # Guard against ZeroDivisionError when the input list is empty.
        avg_chunks = len(all_chunks) / len(documents) if documents else 0.0

        logger.info("Chunking completed:")
        logger.info(f" - Documents processed: {len(documents)}")
        logger.info(f" - Total chunks created: {len(all_chunks)}")
        logger.info(f" - Total characters processed: {total_chars_processed:,}")
        logger.info(f" - Average chunks per document: {avg_chunks:.1f}")
        return all_chunks

    def save_chunks(self, chunks: List[Dict[str, Any]], filepath: str):
        """
        Save chunks to a JSON file.

        Args:
            chunks (List[Dict[str, Any]]): Chunks to save.
            filepath (str): Output file path; parent directories are
                created if missing.
        """
        path = Path(filepath)
        logger.info(f"Saving {len(chunks)} chunks to {path}")
        try:
            # Ensure the destination directory exists before writing.
            path.parent.mkdir(parents=True, exist_ok=True)
            with open(path, 'w', encoding='utf-8') as f:
                json.dump(chunks, f, indent=2, ensure_ascii=False)
            file_size = path.stat().st_size / (1024 * 1024)  # MB
            logger.info(f"Chunks saved successfully ({file_size:.2f} MB)")
        except Exception as e:
            logger.error(f"Error saving chunks to {path}: {e}")
            raise

    def get_chunk_statistics(self, chunks: List[Dict[str, Any]]) -> Dict[str, Any]:
        """
        Calculate summary statistics about the chunks.

        Args:
            chunks (List[Dict[str, Any]]): Chunks to analyze.

        Returns:
            Dict[str, Any]: Statistics dictionary; empty when `chunks`
            is empty.
        """
        if not chunks:
            return {}

        chunk_lengths = [len(chunk['page_content']) for chunk in chunks]
        unique_urls = {chunk['metadata']['url'] for chunk in chunks}

        return {
            "total_chunks": len(chunks),
            "unique_documents": len(unique_urls),
            "avg_chunk_length": sum(chunk_lengths) / len(chunk_lengths),
            "min_chunk_length": min(chunk_lengths),
            "max_chunk_length": max(chunk_lengths),
            "total_characters": sum(chunk_lengths)
        }

    def process_and_save(
        self,
        input_filepath: str,
        output_filepath: str = "chunks.json"
    ) -> Dict[str, Any]:
        """
        Complete processing pipeline: load, chunk, and save.

        Args:
            input_filepath (str): Path to scraped content JSON.
            output_filepath (str): Path to save chunks JSON.

        Returns:
            Dict[str, Any]: Processing statistics (empty when no chunks
            were produced).
        """
        logger.info("Starting document chunking pipeline")
        documents = self.load_scraped_content(input_filepath)
        chunks = self.chunk_all_documents(documents)
        self.save_chunks(chunks, output_filepath)
        stats = self.get_chunk_statistics(chunks)
        logger.info("Chunking pipeline completed successfully")
        return stats
def main():
    """
    Run the chunking pipeline end to end with default file names.

    Returns:
        int: 0 on success, 1 on failure (usable as a process exit code).
    """
    print("Scikit-learn Documentation Chunker")
    print("=" * 50)

    # Configuration
    input_file = "scraped_content.json"
    output_file = "chunks.json"

    # Defaults chosen as reasonable settings for documentation text.
    chunker = DocumentChunker(
        chunk_size=1000,
        chunk_overlap=150
    )

    try:
        stats = chunker.process_and_save(input_file, output_file)

        print("\nProcessing Results:")
        # get_chunk_statistics returns {} when no chunks were produced;
        # report that explicitly instead of hitting a KeyError below,
        # which would be mis-reported as a pipeline failure.
        if not stats:
            print("  No chunks were created - check the input file contents.")
            return 0
        # NOTE(review): the label prefixes below look mis-encoded
        # (mojibake); preserved byte-for-byte to avoid guessing the
        # originally intended glyphs.
        print(f" π Total chunks created: {stats['total_chunks']:,}")
        print(f" π Unique documents: {stats['unique_documents']}")
        print(f" π Average chunk length: {stats['avg_chunk_length']:.0f} characters")
        print(f" π Min/Max chunk length: {stats['min_chunk_length']}/{stats['max_chunk_length']}")
        print(f" πΎ Total characters: {stats['total_characters']:,}")
        print(f" β Chunks saved to: {output_file}")
    except Exception as e:
        logger.error(f"Pipeline failed: {e}")
        print(f"\nβ Error: {e}")
        return 1
    return 0
if __name__ == "__main__":
    # sys.exit is preferred over the site-provided exit() builtin, which
    # is intended for interactive use and absent under `python -S`.
    sys.exit(main())