Spaces:

NavyDevilDoc
/

AI_Toolkit

Sleeping

App Files Files Community

AI_Toolkit / src /core /HierarchicalChunker.py

NavyDevilDoc

Upload 10 files

c0f31c1 verified 4 months ago

raw

history blame contribute delete

6.82 kB

	"""
	HierarchicalChunker.py

	A module for hierarchical document chunking that combines page-level and semantic chunking.

	Features:
	- Multi-level document representation (pages and chunks)
	- Semantic chunking with sentence boundaries
	- Size control (a chunk_overlap setting is accepted, but this module does not apply it yet)
	- Hierarchical metadata
	"""

	import logging
	import spacy
	from typing import Dict, List, Optional, Any
	from langchain_core.documents import Document
	from core.PageChunker import PageChunker

	# Module-level logger, named after this module; used by HierarchicalChunker
	# for the model-install notice and the per-document processing summary.
	logger = logging.getLogger(__name__)

	class HierarchicalChunker(PageChunker):
	    """Chunk a document at two levels: whole pages and sentence-aligned chunks.

	    Page-level documents are obtained from ``page_process_document`` on the
	    ``PageChunker`` base class. Each page is then split into chunks built
	    from whole spaCy sentences, packed greedily up to ``chunk_size``
	    characters.
	    """

	    # Name of the spaCy pipeline used for sentence segmentation.
	    _SPACY_MODEL = "en_core_web_sm"

	    def __init__(
	        self,
	        model_name: Optional[str] = None,
	        embedding_model: Optional[Any] = None,
	        chunk_size: int = 500,
	        chunk_overlap: int = 50,
	        similarity_threshold: float = 0.85
	    ) -> None:
	        """
	        Initialize hierarchical chunker with specified models and parameters.

	        Args:
	            model_name: Name of the model for tokenization (forwarded to PageChunker)
	            embedding_model: Model for generating embeddings (forwarded to PageChunker)
	            chunk_size: Maximum size of a semantic chunk, in characters
	            chunk_overlap: Overlap between chunks (stored only, see note below)
	            similarity_threshold: Similarity threshold for merging chunks
	                (stored only, see note below)

	        Raises:
	            OSError: If the spaCy model cannot be loaded even after a
	                download attempt.
	        """
	        super().__init__(model_name, embedding_model)
	        self.chunk_size = chunk_size
	        # NOTE(review): chunk_overlap and similarity_threshold are kept on the
	        # instance, but no method of this class reads them; chunks are
	        # currently produced without overlap or similarity-based merging.
	        self.chunk_overlap = chunk_overlap
	        self.similarity_threshold = similarity_threshold

	        # Initialize spaCy for NLP tasks
	        self.nlp = self._load_spacy_model()

	    def _load_spacy_model(self):
	        """Load the spaCy pipeline, attempting one download if it is missing.

	        Returns:
	            The loaded spaCy pipeline object.

	        Raises:
	            OSError: If the model is still unavailable after the download attempt.
	        """
	        try:
	            return spacy.load(self._SPACY_MODEL)
	        except OSError:
	            logger.info("Installing spaCy model...")
	            import subprocess
	            import sys

	            # Run the download with the interpreter that is executing this
	            # code, so the model is installed into the same environment;
	            # a bare "python" on PATH may point somewhere else entirely.
	            result = subprocess.run(
	                [sys.executable or "python", "-m", "spacy", "download", self._SPACY_MODEL],
	                capture_output=True,
	                text=True,
	            )
	            if result.returncode != 0:
	                # Surface the reason instead of discarding it; the load below
	                # will still raise OSError, as it did before this change.
	                logger.error(
	                    "spaCy model download failed (exit code %s): %s",
	                    result.returncode,
	                    (result.stderr or "").strip(),
	                )
	            return spacy.load(self._SPACY_MODEL)

	    def _build_chunk_document(
	        self,
	        sentences: List[str],
	        page_number: int,
	        chunk_number: int
	    ) -> Document:
	        """Join sentences into one chunk Document with its metadata.

	        Statistics come from ``self.analyze_text``, which is not defined in
	        this class (presumably inherited from PageChunker).

	        Args:
	            sentences: Stripped sentence texts that make up the chunk
	            page_number: Page the chunk belongs to
	            chunk_number: 1-based position of the chunk within its page

	        Returns:
	            Document whose metadata marks it as a "chunk"-level entry
	        """
	        chunk_text = " ".join(sentences)
	        stats = self.analyze_text(chunk_text)
	        return Document(
	            page_content=chunk_text,
	            metadata={
	                "level": "chunk",
	                "page_num": page_number,
	                "chunk_num": chunk_number,
	                "parent_page": page_number,
	                "char_count": stats["char_count"],
	                "token_count": stats["token_count"],
	                "sentence_count": stats["sentence_count"],
	                "word_count": stats["word_count"],
	                "has_ocr": stats.get("has_content", "true")
	            }
	        )

	    def _create_semantic_chunks(self, content: str, page_number: int) -> List[Document]:
	        """
	        Create semantic chunks with detailed metadata.

	        Sentences are packed greedily: a chunk is closed as soon as adding
	        the next sentence (plus the joining space) would push it past
	        ``chunk_size`` characters. A single sentence longer than
	        ``chunk_size`` is never split and becomes a chunk on its own.

	        Args:
	            content: The page content to chunk
	            page_number: The page number

	        Returns:
	            List of Document objects representing semantic chunks (empty
	            when the content is blank)
	        """
	        if not content.strip():
	            return []

	        chunks: List[Document] = []
	        pending: List[str] = []
	        pending_length = 0  # length of " ".join(pending)

	        for sent in self.nlp(content).sents:
	            sent_text = sent.text.strip()
	            if not sent_text:
	                # Whitespace-only spans would only add stray double spaces.
	                continue
	            sent_length = len(sent_text)

	            # The joining space counts toward the chunk's real length.
	            separator = 1 if pending else 0

	            if pending and pending_length + separator + sent_length > self.chunk_size:
	                chunks.append(
	                    self._build_chunk_document(pending, page_number, len(chunks) + 1)
	                )
	                pending = [sent_text]
	                pending_length = sent_length
	            else:
	                pending.append(sent_text)
	                pending_length += separator + sent_length

	        # Handle final chunk
	        if pending:
	            chunks.append(
	                self._build_chunk_document(pending, page_number, len(chunks) + 1)
	            )

	        self.page_stats.append(f"Created {len(chunks)} chunks for page {page_number}")
	        return chunks

	    def hierarchical_process_document(self, file_path: str, preprocess: bool = True) -> Dict[str, List[Document]]:
	        """
	        Process document with hierarchical chunking strategy.

	        Each page-level document is tagged in place with ``level = "page"``,
	        and its ``metadata["page"]`` value is used as the page number for
	        the chunks created from it.

	        Args:
	            file_path: Path to the PDF file
	            preprocess: Whether to preprocess text

	        Returns:
	            Dictionary with 'pages' and 'chunks' lists of Documents
	        """
	        self.page_stats = []  # Reset stats

	        # First get the page-level documents using PageChunker
	        page_docs = super().page_process_document(file_path, preprocess)

	        # Now create chunk-level documents
	        chunk_docs: List[Document] = []
	        for page_doc in page_docs:
	            page_num = page_doc.metadata["page"]

	            # Mark this as a page-level document
	            page_doc.metadata["level"] = "page"

	            # Create chunks for this page
	            chunk_docs.extend(
	                self._create_semantic_chunks(page_doc.page_content, page_num)
	            )

	        # Log summary information
	        logger.info("\nHierarchical Processing Summary:")
	        logger.info("Total Pages: %d", len(page_docs))
	        logger.info("Total Chunks: %d", len(chunk_docs))
	        logger.info("\n".join(self.page_stats))

	        return {
	            "pages": page_docs,
	            "chunks": chunk_docs
	        }

	    def process_document(self, file_path: str, preprocess: bool = True) -> Dict[str, List[Document]]:
	        """
	        Process document using hierarchical chunking strategy.

	        Thin wrapper that delegates to ``hierarchical_process_document``.
	        The original docstring describes it as the implementation of an
	        abstract method; that declaration is outside this file.

	        Args:
	            file_path: Path to the PDF file
	            preprocess: Whether to preprocess text

	        Returns:
	            Dictionary with 'pages' and 'chunks' lists of Documents
	        """
	        return self.hierarchical_process_document(file_path, preprocess)