# NOTE: the three lines below are Hugging Face repo-page scrape residue
# (uploader avatar caption, commit message, and short commit hash), kept
# here as comments so the module remains valid Python:
# sadickam's picture
# Initial commit for HF Space
# 3326079
"""Structure-aware chunking module for the RAG pipeline.
This module provides tools for splitting Markdown documents into
semantically meaningful chunks suitable for embedding and retrieval.
The chunking process preserves document structure by:
- Respecting heading boundaries
- Inheriting parent headings for context
- Maintaining code block integrity
- Keeping related content together
- Attaching lists to preceding paragraphs
- Converting tables to natural language descriptions
Components:
- Chunk: Pydantic model representing a document chunk with metadata
- ChunkingConfig: Configuration parameters for chunking
- TextNormalizer: Text normalization utilities for PDF extraction artifacts
- HeadingParser: Parser for extracting headings and content blocks
- ListAttacher: Attaches lists to preceding paragraph context
- TableConverter: Converts markdown tables to natural language
- ContentProcessor: Unified processor for lists and tables
- Chunker: Main orchestrator class for document chunking pipeline
- ChunkingStrategy: Abstract base class for chunking strategies
- MarkdownStrategy: Strategy for general Markdown documents
- CodeStrategy: Strategy for code-heavy content
- StructureAwareStrategy: Strategy combining heading, list, and table handling
Lazy Loading:
Heavy dependencies are loaded on first access using __getattr__.
This ensures fast import times when the chunking module is not
immediately needed.
Example:
-------
>>> from rag_chatbot.chunking import ChunkingConfig, Chunker, TextNormalizer
>>> config = ChunkingConfig(min_tokens=450, max_tokens=700)
>>> normalizer = TextNormalizer()
>>> chunker = Chunker(config, normalizer=normalizer)
>>> chunks = chunker.chunk_document(markdown_content, source="doc.pdf")
"""
from __future__ import annotations
from typing import TYPE_CHECKING
if TYPE_CHECKING:
from .chunker import Chunker
from .heading_parser import ContentBlock, HeadingParser, ParsedHeading
from .list_table_attacher import (
AttachedContent,
ContentProcessor,
ListAttacher,
ListDetector,
ParsedTable,
TableConverter,
TableDetector,
)
from .models import (
THERMAL_COMFORT_TERMS,
Chunk,
ChunkingConfig,
TextNormalizer,
)
from .strategies import (
ChunkingStrategy,
CodeStrategy,
MarkdownStrategy,
StructureAwareStrategy,
)
from .token_aware import SlidingWindowChunker, Tokenizer
# =============================================================================
# Module Exports
# =============================================================================
# Public API of the chunking package.  Every name listed here must have a
# matching entry in the _LAZY_IMPORTS registry below so that module-level
# __getattr__ can resolve it on first access.
__all__: list[str] = [
    # Data models (Step 3.1)
    "Chunk", "ChunkingConfig", "TextNormalizer", "THERMAL_COMFORT_TERMS",
    # Heading parser (Step 3.2)
    "ParsedHeading", "ContentBlock", "HeadingParser",
    # List and table attachment (Step 3.3)
    "AttachedContent", "ListDetector", "ListAttacher", "ParsedTable",
    "TableDetector", "TableConverter", "ContentProcessor",
    # Chunker pipeline (Step 3.5)
    "Chunker", "ChunkingStrategy", "MarkdownStrategy", "CodeStrategy",
    "StructureAwareStrategy",
    # Token-aware chunking (Step 3.4)
    "Tokenizer", "SlidingWindowChunker",
]
# =============================================================================
# Lazy Import Registry
# =============================================================================
# Maps exported names to their (module, attribute) tuples for lazy loading.
# This pattern avoids loading heavy dependencies until they are actually used.
# =============================================================================
# =============================================================================
# Lazy Import Registry
# =============================================================================
# Maps each exported name to the (relative module, attribute) pair it is
# loaded from on first access.  Entries are declared grouped by submodule
# and flattened into one flat lookup table; insertion order matches the
# declaration order of __all__ above.
# =============================================================================
_LAZY_IMPORTS: dict[str, tuple[str, str]] = {
    attr: (module, attr)
    for module, attrs in (
        # Data models (Step 3.1)
        (".models", ("Chunk", "ChunkingConfig", "TextNormalizer", "THERMAL_COMFORT_TERMS")),
        # Heading parser (Step 3.2)
        (".heading_parser", ("ParsedHeading", "ContentBlock", "HeadingParser")),
        # List and table attachment (Step 3.3)
        (
            ".list_table_attacher",
            (
                "AttachedContent",
                "ListDetector",
                "ListAttacher",
                "ParsedTable",
                "TableDetector",
                "TableConverter",
                "ContentProcessor",
            ),
        ),
        # Chunker pipeline (Step 3.5)
        (".chunker", ("Chunker",)),
        (".strategies", ("ChunkingStrategy", "MarkdownStrategy", "CodeStrategy", "StructureAwareStrategy")),
        # Token-aware chunking (Step 3.4)
        (".token_aware", ("Tokenizer", "SlidingWindowChunker")),
    )
    for attr in attrs
}
def __getattr__(name: str) -> object:
    """Lazily load module exports on first access (PEP 562).

    This function is called only when an attribute is not found in the
    module's namespace.  It resolves the name through the registry-based
    lookup in ``_LAZY_IMPORTS``, importing the backing submodule on demand.

    Args:
    ----
        name: The name of the attribute being accessed.

    Returns:
    -------
        The requested attribute if it exists in the lazy import registry.

    Raises:
    ------
        AttributeError: If the attribute is not a valid export.
    """
    if name in _LAZY_IMPORTS:
        module_path, attr_name = _LAZY_IMPORTS[name]
        # importlib.import_module with package=__name__ gives the correct
        # package context for the relative module paths in the registry.
        import importlib

        module = importlib.import_module(module_path, package=__name__)
        attr = getattr(module, attr_name)
        # Cache the resolved attribute in the module namespace so that
        # subsequent accesses bypass __getattr__ entirely (standard PEP 562
        # lazy-loading idiom; the original resolved the import on every access).
        globals()[name] = attr
        return attr
    msg = f"module {__name__!r} has no attribute {name!r}"  # pragma: no cover
    raise AttributeError(msg)  # pragma: no cover