Spaces:
Sleeping
Sleeping
| #!/usr/bin/env python3 | |
| """ | |
| Text Chunker for Scikit-learn Documentation | |
| This module processes the scraped Scikit-learn documentation and chunks it | |
| into smaller, manageable pieces for use in a RAG application. | |
| Author: AI Assistant | |
| Date: September 2025 | |
| """ | |
import json
import logging
import sys
from pathlib import Path
from typing import Any, Dict, List, Optional

from langchain_text_splitters import RecursiveCharacterTextSplitter
# Configure module-wide logging: INFO level with timestamped messages so
# pipeline progress (loading, chunking, saving) is visible on the console.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
class DocumentChunker:
    """
    Chunk scraped documentation into smaller pieces for a RAG application.

    Wraps a ``RecursiveCharacterTextSplitter`` and adds helpers to load
    scraped JSON, chunk documents while preserving source metadata
    (URL, chunk index), save the results, and compute summary statistics.
    """

    def __init__(
        self,
        chunk_size: int = 1000,
        chunk_overlap: int = 150,
        separators: Optional[List[str]] = None
    ):
        """
        Initialize the DocumentChunker.

        Args:
            chunk_size (int): Target size for each chunk in characters.
            chunk_overlap (int): Number of characters to overlap between
                consecutive chunks.
            separators (Optional[List[str]]): Custom separators for text
                splitting. When None, defaults tuned for documentation
                are used.
        """
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

        # Default separators are ordered coarse-to-fine so the splitter
        # prefers natural boundaries before cutting mid-word.
        if separators is None:
            separators = [
                "\n\n",  # double newlines (paragraphs)
                "\n",    # single newlines
                ". ",    # sentences
                "! ",    # exclamations
                "? ",    # questions
                "; ",    # semicolons
                ", ",    # commas
                " ",     # spaces
                "",      # characters (last resort)
            ]
        self.separators = separators

        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            separators=separators,
            length_function=len,
        )
        logger.info(f"Initialized chunker with size={chunk_size}, overlap={chunk_overlap}")

    def load_scraped_content(self, filepath: str) -> List[Dict[str, str]]:
        """
        Load scraped content from a JSON file.

        Args:
            filepath (str): Path to the scraped content JSON file.

        Returns:
            List[Dict[str, str]]: Documents with 'url' and 'text' keys.

        Raises:
            FileNotFoundError: If the file doesn't exist.
            json.JSONDecodeError: If the file is not valid JSON.
        """
        path = Path(filepath)
        if not path.exists():
            raise FileNotFoundError(f"Scraped content file not found: {path}")

        logger.info(f"Loading scraped content from {path}")
        try:
            with open(path, 'r', encoding='utf-8') as f:
                content = json.load(f)
            logger.info(f"Loaded {len(content)} documents from {path}")
            return content
        except json.JSONDecodeError as e:
            logger.error(f"Invalid JSON in {path}: {e}")
            raise
        except Exception as e:
            logger.error(f"Error loading {path}: {e}")
            raise

    def create_chunk_metadata(self, original_url: str, chunk_index: int) -> Dict[str, Any]:
        """
        Create metadata for a chunk.

        Args:
            original_url (str): URL of the original document.
            chunk_index (int): Index of this chunk within the document.

        Returns:
            Dict[str, Any]: Metadata with 'url', 'chunk_index' and
            'source' keys.
        """
        return {
            "url": original_url,
            "chunk_index": chunk_index,
            "source": "scikit-learn-docs"
        }

    def chunk_document(self, document: Dict[str, str]) -> List[Dict[str, Any]]:
        """
        Chunk a single document into smaller pieces.

        Args:
            document (Dict[str, str]): Document with 'url' and 'text' keys.

        Returns:
            List[Dict[str, Any]]: Chunks with 'page_content' and
            'metadata' keys; empty when the document has minimal content.
        """
        url = document['url']
        text = document['text']

        # Documents under 100 characters carry no useful content to chunk.
        if len(text.strip()) < 100:
            logger.warning(f"Skipping document with minimal content: {url}")
            return []

        logger.info(f"Chunking document: {url} ({len(text)} characters)")
        text_chunks = self.text_splitter.split_text(text)

        chunks = []
        for i, chunk_text in enumerate(text_chunks):
            # Strip once; drop tiny fragments that carry too little context.
            stripped = chunk_text.strip()
            if len(stripped) < 50:
                continue
            chunks.append({
                "page_content": stripped,
                "metadata": self.create_chunk_metadata(url, i)
            })

        logger.info(f"Created {len(chunks)} chunks from {url}")
        return chunks

    def chunk_all_documents(self, documents: List[Dict[str, str]]) -> List[Dict[str, Any]]:
        """
        Chunk every document in the collection.

        Documents that fail to chunk are logged and skipped so one bad
        record does not abort the whole run.

        Args:
            documents (List[Dict[str, str]]): Documents to chunk.

        Returns:
            List[Dict[str, Any]]: All chunks from all documents.
        """
        logger.info(f"Starting to chunk {len(documents)} documents")

        all_chunks = []
        total_chars_processed = 0

        for doc_index, document in enumerate(documents, 1):
            try:
                all_chunks.extend(self.chunk_document(document))
                total_chars_processed += len(document['text'])
                if doc_index % 10 == 0:
                    logger.info(f"Processed {doc_index}/{len(documents)} documents")
            except Exception as e:
                logger.error(f"Error chunking document {document.get('url', 'unknown')}: {e}")
                continue

        # Guard against ZeroDivisionError when the input list is empty.
        avg_chunks = len(all_chunks) / len(documents) if documents else 0.0

        logger.info("Chunking completed:")
        logger.info(f" - Documents processed: {len(documents)}")
        logger.info(f" - Total chunks created: {len(all_chunks)}")
        logger.info(f" - Total characters processed: {total_chars_processed:,}")
        logger.info(f" - Average chunks per document: {avg_chunks:.1f}")
        return all_chunks

    def save_chunks(self, chunks: List[Dict[str, Any]], filepath: str):
        """
        Save chunks to a JSON file.

        Args:
            chunks (List[Dict[str, Any]]): Chunks to save.
            filepath (str): Output file path; parent directories are
                created if missing.
        """
        path = Path(filepath)
        logger.info(f"Saving {len(chunks)} chunks to {path}")
        try:
            # Ensure the destination directory exists before writing.
            path.parent.mkdir(parents=True, exist_ok=True)
            with open(path, 'w', encoding='utf-8') as f:
                json.dump(chunks, f, indent=2, ensure_ascii=False)
            file_size = path.stat().st_size / (1024 * 1024)  # MB
            logger.info(f"Chunks saved successfully ({file_size:.2f} MB)")
        except Exception as e:
            logger.error(f"Error saving chunks to {path}: {e}")
            raise

    def get_chunk_statistics(self, chunks: List[Dict[str, Any]]) -> Dict[str, Any]:
        """
        Calculate summary statistics about the chunks.

        Args:
            chunks (List[Dict[str, Any]]): Chunks to analyze.

        Returns:
            Dict[str, Any]: Statistics dictionary; empty when `chunks`
            is empty.
        """
        if not chunks:
            return {}

        chunk_lengths = [len(chunk['page_content']) for chunk in chunks]
        unique_urls = {chunk['metadata']['url'] for chunk in chunks}

        return {
            "total_chunks": len(chunks),
            "unique_documents": len(unique_urls),
            "avg_chunk_length": sum(chunk_lengths) / len(chunk_lengths),
            "min_chunk_length": min(chunk_lengths),
            "max_chunk_length": max(chunk_lengths),
            "total_characters": sum(chunk_lengths)
        }

    def process_and_save(
        self,
        input_filepath: str,
        output_filepath: str = "chunks.json"
    ) -> Dict[str, Any]:
        """
        Complete processing pipeline: load, chunk, and save.

        Args:
            input_filepath (str): Path to scraped content JSON.
            output_filepath (str): Path to save chunks JSON.

        Returns:
            Dict[str, Any]: Processing statistics (empty when no chunks
            were produced).
        """
        logger.info("Starting document chunking pipeline")
        documents = self.load_scraped_content(input_filepath)
        chunks = self.chunk_all_documents(documents)
        self.save_chunks(chunks, output_filepath)
        stats = self.get_chunk_statistics(chunks)
        logger.info("Chunking pipeline completed successfully")
        return stats
def main():
    """
    Run the chunking pipeline end to end with default file names.

    Returns:
        int: 0 on success, 1 on failure (usable as a process exit code).
    """
    print("Scikit-learn Documentation Chunker")
    print("=" * 50)

    # Configuration
    input_file = "scraped_content.json"
    output_file = "chunks.json"

    # Defaults chosen as reasonable settings for documentation text.
    chunker = DocumentChunker(
        chunk_size=1000,
        chunk_overlap=150
    )

    try:
        stats = chunker.process_and_save(input_file, output_file)

        print("\nProcessing Results:")
        # get_chunk_statistics returns {} when no chunks were produced;
        # report that explicitly instead of hitting a KeyError below,
        # which would be mis-reported as a pipeline failure.
        if not stats:
            print("  No chunks were created - check the input file contents.")
            return 0
        # NOTE(review): the label prefixes below look mis-encoded
        # (mojibake); preserved byte-for-byte to avoid guessing the
        # originally intended glyphs.
        print(f" π Total chunks created: {stats['total_chunks']:,}")
        print(f" π Unique documents: {stats['unique_documents']}")
        print(f" π Average chunk length: {stats['avg_chunk_length']:.0f} characters")
        print(f" π Min/Max chunk length: {stats['min_chunk_length']}/{stats['max_chunk_length']}")
        print(f" πΎ Total characters: {stats['total_characters']:,}")
        print(f" β Chunks saved to: {output_file}")
    except Exception as e:
        logger.error(f"Pipeline failed: {e}")
        print(f"\nβ Error: {e}")
        return 1
    return 0
if __name__ == "__main__":
    # sys.exit is preferred over the site-provided exit() builtin, which
    # is intended for interactive use and absent under `python -S`.
    sys.exit(main())