Spaces:

sadickam
/

pythermalcomfort_Chat

Running

App Files Files Community

pythermalcomfort_Chat / src /rag_chatbot /extraction /__init__.py

sadickam

Initial commit for HF Space

3326079 3 months ago

raw

history blame contribute delete

5.74 kB

	"""PDF extraction module for the RAG pipeline.

	This module provides tools for extracting text and structure from PDF
	documents and converting them to well-structured Markdown format
	suitable for chunking and embedding.

	Components:
	Data Models:
	- ExtractedPage: Represents a single page with content, tables, images
	- ExtractedDocument: Represents a complete extracted PDF document

	Protocols:
	- Extractor: Synchronous protocol for PDF extraction implementations
	- AsyncExtractor: Asynchronous protocol for PDF extraction

	Implementations:
	- PDFExtractor: Main class for PDF document processing
	- MarkdownConverter: Converts extracted content to Markdown format

	Lazy Loading:
	Heavy dependencies (pymupdf4llm) are loaded on first access using
	__getattr__. This ensures fast import times when the extraction
	module is not immediately needed. Lightweight components (models,
	protocols) are also lazy-loaded for consistency.

	Example:
	-------
	>>> from rag_chatbot.extraction import PDFExtractor, ExtractedDocument
	>>> from pathlib import Path
	>>>
	>>> # Extract a PDF document
	>>> extractor = PDFExtractor()
	>>> doc = extractor.extract(Path("document.pdf"))
	>>> print(f"Extracted {doc.page_count} pages")
	>>>
	>>> # Access individual pages
	>>> for page in doc.pages:
	...     print(f"Page {page.page_number}: {len(page.content)} chars")
	>>>
	>>> # Export to Markdown
	>>> markdown = doc.to_markdown()

	"""

	from __future__ import annotations

	from typing import TYPE_CHECKING

	# =============================================================================
	# Type Checking Imports
	# =============================================================================
	# These imports are only processed by type checkers (mypy, pyright) and IDEs.
	# They enable proper type hints and autocompletion without runtime overhead.
	# =============================================================================

	if TYPE_CHECKING:
	from rag_chatbot.extraction.base import AsyncExtractor, Extractor
	from rag_chatbot.extraction.markdown_converter import MarkdownConverter
	from rag_chatbot.extraction.models import ExtractedDocument, ExtractedPage
	from rag_chatbot.extraction.pdf_extractor import PDFExtractor

	# =============================================================================
	# Module Exports
	# =============================================================================
	# The __all__ list defines what is exported by `from rag_chatbot.extraction import *`.
	# All exports are lazy-loaded on first access for fast import times.
	# =============================================================================
	# Public API of the package, grouped by role. Each name listed here is
	# resolved lazily by the module-level __getattr__ hook rather than being
	# imported eagerly at package import time.
	__all__: list[str] = [
	    "ExtractedPage",  # data model
	    "ExtractedDocument",  # data model
	    "Extractor",  # protocol
	    "AsyncExtractor",  # protocol
	    "PDFExtractor",  # implementation
	    "MarkdownConverter",  # implementation
	]


	# Maps each lazily exported name to the submodule that defines it. Keeping
	# the mapping in one table avoids a copy-pasted ``if`` branch per export.
	_LAZY_EXPORTS: dict[str, str] = {
	    # Data models
	    "ExtractedPage": "rag_chatbot.extraction.models",
	    "ExtractedDocument": "rag_chatbot.extraction.models",
	    # Protocols
	    "Extractor": "rag_chatbot.extraction.base",
	    "AsyncExtractor": "rag_chatbot.extraction.base",
	    # Implementations (these modules may pull in heavy dependencies)
	    "PDFExtractor": "rag_chatbot.extraction.pdf_extractor",
	    "MarkdownConverter": "rag_chatbot.extraction.markdown_converter",
	}


	def __getattr__(name: str) -> object:
	    """Lazily resolve a public export of this package.

	    Python calls a module-level ``__getattr__`` only after normal lookup in
	    the module namespace has failed (PEP 562). The defining submodule is
	    therefore imported the first time an export is requested, which keeps
	    ``import rag_chatbot.extraction`` itself cheap.

	    Once resolved, the object is stored in the module globals so later
	    accesses are served by ordinary attribute lookup and never reach this
	    hook again.

	    Args:
	    ----
	        name: The name of the attribute being accessed.

	    Returns:
	    -------
	        The object called ``name`` from the submodule that defines it.

	    Raises:
	    ------
	        AttributeError: If ``name`` is not one of the lazy exports.
	        ImportError: If the defining submodule cannot be imported, or it
	            does not provide ``name``.

	    Example:
	    -------
	        >>> from rag_chatbot.extraction import ExtractedDocument
	        >>> # The models submodule is imported here, on first access.

	    """
	    module_path = _LAZY_EXPORTS.get(name)
	    if module_path is None:
	        msg = f"module {__name__!r} has no attribute {name!r}"
	        raise AttributeError(msg)

	    # Function-scope import keeps this block self-contained.
	    from importlib import import_module

	    module = import_module(module_path)
	    try:
	        value = getattr(module, name)
	    except AttributeError as exc:
	        # Mirror ``from module import name``: a missing symbol is an import
	        # failure. Letting AttributeError escape from this hook would make
	        # ``hasattr``/``getattr(..., default)`` silently hide the problem.
	        msg = f"cannot import name {name!r} from {module_path!r}"
	        raise ImportError(msg, name=module_path) from exc

	    # Cache so subsequent accesses bypass __getattr__ entirely.
	    globals()[name] = value
	    return value