Spaces:
Sleeping
Sleeping
| """Heading inheritance parser for structure-aware markdown chunking. | |
| This module provides classes and functions for parsing markdown headings | |
| and associating content blocks with their heading context. It is a critical | |
| component of the RAG chunking pipeline that enables: | |
| - Hierarchical heading path tracking (H1 > H2 > H3 chains) | |
| - Context inheritance for content blocks | |
| - Proper handling of code blocks (avoiding false heading detection) | |
| - Graceful handling of edge cases (level skips, duplicate headings) | |
| The heading path is essential for providing context to the LLM during | |
| retrieval - knowing that content comes from "Chapter 1 > Methods > Setup" | |
| helps the model understand the content's role in the document structure. | |
| Models: | |
| - ParsedHeading: Pydantic model for a parsed heading with level and text | |
| - ContentBlock: Pydantic model for content with its heading context | |
| Classes: | |
| - HeadingParser: Main parser class for extracting headings and content | |
| Design Principles: | |
| - All models use Pydantic v2 for validation | |
| - Code blocks are properly detected and skipped | |
| - Text normalization is optional but integrated | |
| - Comprehensive inline documentation | |
| Example: | |
| ------- | |
| >>> from rag_chatbot.chunking.heading_parser import HeadingParser | |
| >>> parser = HeadingParser() | |
| >>> markdown = '''# Title | |
| ... | |
| ... Introduction text. | |
| ... | |
| ... ## Section | |
| ... | |
| ... Section content. | |
| ... ''' | |
| >>> headings = parser.parse_headings(markdown) | |
| >>> len(headings) | |
| 2 | |
| >>> blocks = parser.parse_content_blocks(markdown) | |
| >>> blocks[0].heading_path | |
| ['H1: Title'] | |
| """ | |
| from __future__ import annotations | |
| import re | |
| from typing import TYPE_CHECKING | |
| from pydantic import ( | |
| BaseModel, | |
| ConfigDict, | |
| Field, | |
| field_validator, | |
| model_validator, | |
| ) | |
| # ============================================================================= | |
| # Type Checking Imports | |
| # ============================================================================= | |
| # These imports are only processed by type checkers (mypy, pyright) and IDEs. | |
| # They enable proper type hints without runtime overhead. | |
| # ============================================================================= | |
| if TYPE_CHECKING: | |
| from typing import Self | |
| from .models import TextNormalizer | |
| # ============================================================================= | |
| # Module Exports | |
| # ============================================================================= | |
| __all__: list[str] = [ | |
| "ParsedHeading", | |
| "ContentBlock", | |
| "HeadingParser", | |
| ] | |
| # ============================================================================= | |
| # Regex Patterns | |
| # ============================================================================= | |
| # Pre-compiled regex patterns for efficient parsing of markdown content. | |
| # These patterns are used throughout the module for heading detection, | |
| # code block identification, and content extraction. | |
| # ============================================================================= | |
# Pattern for ATX-style headings (# H1, ## H2, etc.)
# Breakdown:
#   ^               - Start of line
#   [ ]{0,3}        - 0-3 leading spaces (allowed by CommonMark spec)
#   (#{1,6})        - 1-6 hash characters (captured as group 1 for level)
#   [ \t]+          - At least one space or tab (required after hashes)
#   (.+?)           - The heading text (non-greedy, captured as group 2)
#   (?:[ \t]+#+)?   - Optional ATX closing sequence. It must be separated
#                     from the text by whitespace, so a "#" that is glued to
#                     the text ("# C#", "# Issue #") stays part of the text.
#   [ \t]*          - Optional trailing whitespace
#   $               - End of line
_ATX_HEADING_PATTERN: re.Pattern[str] = re.compile(
    r"^[ ]{0,3}(#{1,6})[ \t]+(.+?)(?:[ \t]+#+)?[ \t]*$",
    re.MULTILINE,
)

# Pattern for a fenced code block delimiter line (``` or ~~~)
# Breakdown:
#   ^               - Start of line
#   [ ]{0,3}        - 0-3 leading spaces
#   (`{3,}|~{3,})   - 3+ backticks or tildes (group 1 captures the fence)
#   .*              - Rest of the line (info string such as a language name)
#   $               - End of line
_FENCED_CODE_START_PATTERN: re.Pattern[str] = re.compile(
    r"^[ ]{0,3}(`{3,}|~{3,}).*$",
    re.MULTILINE,
)

# Pattern for indented code lines: a line that starts with four spaces or a
# tab. Unanchored at the end, so it is meant to be used with ``.match``.
_INDENTED_CODE_PATTERN: re.Pattern[str] = re.compile(
    r"^(?:[ ]{4}|\t)",
)

# Pattern for simple inline code spans: one backtick, one or more
# non-backtick characters, one backtick.
_INLINE_CODE_PATTERN: re.Pattern[str] = re.compile(
    r"`[^`]+`",
)

# Pattern for paragraph boundaries: a newline, optional whitespace (which may
# itself include further newlines), and another newline.
_PARAGRAPH_BREAK_PATTERN: re.Pattern[str] = re.compile(
    r"\n\s*\n",
)
| # ============================================================================= | |
| # Data Models | |
| # ============================================================================= | |
class ParsedHeading(BaseModel):
    """Represents a single parsed heading from markdown content.

    This model captures the essential information about a heading:
    - The heading level (1-6 for H1-H6)
    - The heading text content
    - The line number where the heading appears

    The line number is 1-indexed to match typical editor conventions
    and user expectations.

    Attributes:
    ----------
        level : int
            The heading level from 1 (H1) to 6 (H6).
            Validated to be within this range.
        text : str
            The text content of the heading, stripped of markdown syntax.
            Cannot be empty or whitespace-only.
        line_number : int
            The 1-indexed line number where this heading appears.
            Must be >= 1.

    Example:
    -------
        >>> heading = ParsedHeading(level=2, text="  Methods ", line_number=15)
        >>> heading.level
        2
        >>> f"H{heading.level}: {heading.text}"
        'H2: Methods'

    Note:
    ----
        The text is stripped of leading/trailing whitespace by the
        ``text`` field validator before the field constraints are applied.

    """

    # -------------------------------------------------------------------------
    # Model Configuration
    # -------------------------------------------------------------------------
    model_config = ConfigDict(
        # Allow population by field name
        populate_by_name=True,
        # Validate default values
        validate_default=True,
        # Forbid extra fields to catch typos
        extra="forbid",
        # JSON schema examples for documentation
        json_schema_extra={
            "examples": [
                {"level": 1, "text": "Introduction", "line_number": 1},
                {"level": 2, "text": "Background", "line_number": 5},
                {"level": 3, "text": "Methods Overview", "line_number": 12},
            ]
        },
    )

    # -------------------------------------------------------------------------
    # Fields
    # -------------------------------------------------------------------------
    level: int = Field(
        ...,  # Required field
        ge=1,  # Minimum heading level is 1 (H1)
        le=6,  # Maximum heading level is 6 (H6)
        description="Heading level from 1 (H1) to 6 (H6)",
        examples=[1, 2, 3, 4, 5, 6],
    )
    text: str = Field(
        ...,  # Required field
        min_length=1,  # Cannot be empty
        description="The text content of the heading",
        examples=["Introduction", "Methods", "Results and Discussion"],
    )
    line_number: int = Field(
        ...,  # Required field
        ge=1,  # Line numbers are 1-indexed
        description="The 1-indexed line number where the heading appears",
        examples=[1, 10, 42],
    )

    # -------------------------------------------------------------------------
    # Validators
    # -------------------------------------------------------------------------
    # Registered in "before" mode because the body accepts an arbitrary object
    # (it checks for None and calls ``str()``), which only makes sense ahead
    # of pydantic's own ``str`` validation. Without the decorators the method
    # is a plain function that pydantic never calls.
    @field_validator("text", mode="before")
    @classmethod
    def _strip_and_validate_text(cls, value: object) -> str:
        """Strip whitespace and validate that text is not empty.

        Args:
        ----
            value: The raw input supplied for ``text``.

        Returns:
        -------
            ``str(value)`` with leading/trailing whitespace removed.

        Raises:
        ------
            ValueError: If text is None, empty, or whitespace-only.

        """
        if value is None:
            msg = "text cannot be None"
            raise ValueError(msg)

        # Convert to string and strip whitespace
        text = str(value).strip()

        # Reject empty or whitespace-only text
        if not text:
            msg = "text cannot be empty or whitespace-only"
            raise ValueError(msg)

        return text
class ContentBlock(BaseModel):
    """Represents a block of content with its heading context.

    A content block is a contiguous section of text (typically a paragraph
    or related paragraphs) that exists under a specific heading hierarchy.
    The heading_path provides context about where this content lives in
    the document structure.

    For example, content under "# Chapter 1 > ## Methods > ### Setup"
    would have heading_path = ["H1: Chapter 1", "H2: Methods", "H3: Setup"]

    Attributes:
    ----------
        content : str
            The actual text content. Cannot be empty or whitespace-only.
        heading_path : list[str]
            Hierarchical list of headings providing context.
            Format: ["H1: Title", "H2: Section", "H3: Subsection"]
            Empty list for content before any heading.
        start_line : int
            The 1-indexed line number where this content starts.
        end_line : int
            The 1-indexed line number where this content ends.
            Must be >= start_line.

    Example:
    -------
        >>> block = ContentBlock(
        ...     content="The PMV model predicts thermal sensation.",
        ...     heading_path=["H1: Thermal Comfort", "H2: PMV Model"],
        ...     start_line=10,
        ...     end_line=12,
        ... )
        >>> block.heading_path
        ['H1: Thermal Comfort', 'H2: PMV Model']

    Note:
    ----
        Content is stripped by the ``content`` field validator.
        The heading_path order is preserved exactly as provided.

    """

    # -------------------------------------------------------------------------
    # Model Configuration
    # -------------------------------------------------------------------------
    model_config = ConfigDict(
        populate_by_name=True,
        validate_default=True,
        extra="forbid",
        json_schema_extra={
            "examples": [
                {
                    "content": "The PMV model predicts thermal sensation.",
                    "heading_path": ["H1: Thermal Comfort", "H2: PMV Model"],
                    "start_line": 10,
                    "end_line": 12,
                },
                {
                    "content": "Orphan content before any heading.",
                    "heading_path": [],
                    "start_line": 1,
                    "end_line": 3,
                },
            ]
        },
    )

    # -------------------------------------------------------------------------
    # Fields
    # -------------------------------------------------------------------------
    content: str = Field(
        ...,  # Required field
        min_length=1,  # Cannot be empty
        description="The text content of the block",
    )
    heading_path: list[str] = Field(
        default_factory=list,
        description="Hierarchical list of headings providing context",
    )
    start_line: int = Field(
        ...,  # Required field
        ge=1,  # Line numbers are 1-indexed
        description="The 1-indexed line number where content starts",
    )
    end_line: int = Field(
        ...,  # Required field
        ge=1,  # Line numbers are 1-indexed
        description="The 1-indexed line number where content ends",
    )

    # -------------------------------------------------------------------------
    # Validators
    # -------------------------------------------------------------------------
    # "before" mode: the body accepts any object (None check, ``str()``), so
    # it has to run ahead of pydantic's own ``str`` validation. Without the
    # decorators pydantic never invokes this method.
    @field_validator("content", mode="before")
    @classmethod
    def _strip_and_validate_content(cls, value: object) -> str:
        """Strip whitespace and validate that content is not empty.

        Args:
        ----
            value: The raw input supplied for ``content``.

        Returns:
        -------
            ``str(value)`` with leading/trailing whitespace removed.

        Raises:
        ------
            ValueError: If content is None, empty, or whitespace-only.

        """
        if value is None:
            msg = "content cannot be None"
            raise ValueError(msg)

        # Convert to string and strip whitespace
        content = str(value).strip()

        # Reject empty or whitespace-only content
        if not content:
            msg = "content cannot be empty or whitespace-only"
            raise ValueError(msg)

        return content

    # "after" mode: the check reads ``self.start_line`` / ``self.end_line``,
    # so it needs the fully constructed instance.
    @model_validator(mode="after")
    def _validate_line_range(self) -> Self:
        """Validate that end_line is >= start_line.

        A single-line block has ``end_line == start_line``; anything smaller
        is rejected.

        Returns:
        -------
            The validated model instance.

        Raises:
        ------
            ValueError: If end_line is less than start_line.

        """
        if self.end_line < self.start_line:
            msg = (
                f"end_line ({self.end_line}) must be >= "
                f"start_line ({self.start_line})"
            )
            raise ValueError(msg)
        return self
| # ============================================================================= | |
| # HeadingParser Class | |
| # ============================================================================= | |
class HeadingParser:
    """Parser for extracting headings and content blocks from markdown.

    This class provides methods for:
    1. Parsing ATX-style headings (# H1, ## H2, etc.) from markdown
    2. Building hierarchical heading paths for any line in the document
    3. Extracting content blocks with their heading context

    The parser handles:
    - Fenced code blocks (``` or ~~~) - hash symbols inside are not headings
    - Indented lines (4 spaces or a tab) - not parsed as headings
    - Leading spaces (up to 3) before a heading
    - Heading level skips (H1 -> H3)
    - Duplicate heading texts

    An optional normalizer can be supplied; when present, its
    ``normalize_whitespace`` and ``normalize_jumbled_words`` methods are
    applied to every heading text.

    Example:
    -------
        >>> parser = HeadingParser()
        >>> markdown = '''# Introduction
        ...
        ... First paragraph.
        ...
        ... ## Methods
        ...
        ... Methods content.
        ... '''
        >>> headings = parser.parse_headings(markdown)
        >>> [h.text for h in headings]
        ['Introduction', 'Methods']
        >>> blocks = parser.parse_content_blocks(markdown)
        >>> len(blocks)
        2

    Note:
    ----
        The parser assumes ATX-style headings only. Setext-style headings
        (underlined with = or -) are not currently supported.

    """

    def __init__(self, normalizer: TextNormalizer | None = None) -> None:
        """Initialize the HeadingParser with an optional text normalizer.

        Args:
        ----
            normalizer: Optional normalizer used to clean heading text.
                If None, heading text is used as-is (after stripping).

        """
        # Kept private; only parse_headings reads it.
        self._normalizer = normalizer

    def _find_code_block_ranges(self, markdown: str) -> list[tuple[int, int]]:
        """Find all fenced code block line ranges in the markdown.

        A block opens on a fence line (3+ backticks or tildes) and closes on
        the next fence line that uses the same character, is at least as
        long as the opening fence, and carries nothing but whitespace after
        the fence. A fence line with an info string inside an open block
        (for example "```python") is therefore treated as block content, not
        as a closing fence. A backtick fence line whose trailing text itself
        contains a backtick (for example "``` inline ```") is an inline code
        span, not an opening fence, and is ignored.

        Args:
        ----
            markdown: The markdown content to analyze. Lines are split on
                "\\n" only.

        Returns:
        -------
            List of (start_line, end_line) tuples (1-indexed, inclusive of
            both fence lines). An unclosed block extends to the last line.

        """
        ranges: list[tuple[int, int]] = []
        lines = markdown.split("\n")

        # An empty open_char means "not inside a fenced block".
        open_char = ""
        open_length = 0
        open_start = 0

        for line_number, line in enumerate(lines, start=1):
            fence_match = _FENCED_CODE_START_PATTERN.match(line)
            if fence_match is None:
                continue

            fence = fence_match.group(1)
            trailing = line[fence_match.end(1):]

            if not open_char:
                # The info string of a backtick fence may not contain
                # backticks; such a line is inline code on a single line.
                if fence[0] == "`" and "`" in trailing:
                    continue
                open_char = fence[0]
                open_length = len(fence)
                open_start = line_number
            elif (
                fence[0] == open_char
                and len(fence) >= open_length
                and not trailing.strip()
            ):
                # Same fence type, long enough, and no info string: closes.
                ranges.append((open_start, line_number))
                open_char = ""
                open_length = 0

        # Handle unclosed code block at end of document
        if open_char:
            ranges.append((open_start, len(lines)))

        return ranges

    def _is_line_in_code_block(
        self,
        line_number: int,
        code_ranges: list[tuple[int, int]],
    ) -> bool:
        """Check if a given line number is inside a code block.

        Args:
        ----
            line_number: The 1-indexed line number to check.
            code_ranges: Inclusive (start, end) tuples, as returned by
                _find_code_block_ranges.

        Returns:
        -------
            True if the line falls within any range, False otherwise.

        """
        return any(start <= line_number <= end for start, end in code_ranges)

    def _is_indented_code(self, line: str) -> bool:
        """Check if a line starts with four spaces or a tab.

        Args:
        ----
            line: The line to check.

        Returns:
        -------
            True if the line matches the indented-code pattern.

        """
        return _INDENTED_CODE_PATTERN.match(line) is not None

    def parse_headings(self, markdown: str) -> list[ParsedHeading]:
        r"""Parse all ATX-style headings from markdown content.

        Lines inside fenced code blocks and lines indented by four spaces or
        a tab are skipped. Headings whose text is empty, either as written
        or after normalization, are skipped as well.

        Args:
        ----
            markdown: The markdown content to parse. CRLF and lone CR line
                endings are converted to LF before parsing.

        Returns:
        -------
            List of ParsedHeading objects in document order.
            Empty list if the input is blank or no headings are found.

        Example:
        -------
            >>> parser = HeadingParser()
            >>> md = "# Title\n\n## Section\n\nContent."
            >>> headings = parser.parse_headings(md)
            >>> [(h.level, h.text) for h in headings]
            [(1, 'Title'), (2, 'Section')]

        """
        if not markdown or not markdown.strip():
            return []

        # Normalize line endings so line numbers are counted consistently
        markdown = markdown.replace("\r\n", "\n").replace("\r", "\n")
        code_ranges = self._find_code_block_ranges(markdown)

        headings: list[ParsedHeading] = []
        for line_number, line in enumerate(markdown.split("\n"), start=1):
            if self._is_line_in_code_block(line_number, code_ranges):
                continue
            if self._is_indented_code(line):
                continue

            match = _ATX_HEADING_PATTERN.match(line)
            if match is None:
                continue

            # Group 1 is the hash run (its length is the level),
            # group 2 is the heading text.
            level = len(match.group(1))
            text = match.group(2).strip()
            if not text:
                continue

            if self._normalizer is not None:
                text = self._normalizer.normalize_whitespace(text)
                text = self._normalizer.normalize_jumbled_words(text)
                # Normalization could leave nothing behind; skip the heading
                # rather than let model validation raise.
                if not text or not text.strip():
                    continue

            headings.append(
                ParsedHeading(level=level, text=text, line_number=line_number)
            )

        return headings

    def build_heading_path(
        self,
        headings: list[ParsedHeading],
        line: int,
    ) -> list[str]:
        """Build the heading path for a given line number.

        Walks the headings in order up to ``line``. Each heading replaces
        any recorded heading at the same level and drops every recorded
        heading at a deeper level, so the result is the chain of headings
        still in scope at ``line``.

        Args:
        ----
            headings: Headings in document order (iteration stops at the
                first heading past ``line``).
            line: The 1-indexed line number to build the path for.

        Returns:
        -------
            Strings of the form "H<level>: <text>", ordered by level.
            Empty list if no heading starts at or before ``line``.

        Example:
        -------
            >>> parser = HeadingParser()
            >>> headings = [
            ...     ParsedHeading(level=1, text="Title", line_number=1),
            ...     ParsedHeading(level=2, text="Section", line_number=5),
            ... ]
            >>> parser.build_heading_path(headings, 7)
            ['H1: Title', 'H2: Section']
            >>> parser.build_heading_path(headings, 3)  # Before H2
            ['H1: Title']

        Note:
        ----
            Level skips are kept as-is: H1 followed by H3 yields a path
            containing both H1 and H3.

        """
        # level -> text of the heading currently in scope at that level
        in_scope: dict[int, str] = {}

        for heading in headings:
            if heading.line_number > line:
                break
            # Keep only strictly shallower levels, then record this heading.
            in_scope = {
                level: text
                for level, text in in_scope.items()
                if level < heading.level
            }
            in_scope[heading.level] = heading.text

        return [f"H{level}: {in_scope[level]}" for level in sorted(in_scope)]

    def _extract_content_segments(
        self,
        markdown: str,
    ) -> list[tuple[str, int, int]]:
        """Extract non-heading content segments from markdown.

        Segments are runs of consecutive lines. A segment ends at:
        - a blank line outside a fenced code block,
        - a heading line,
        - the first line of a fenced code block (the block starts its own
          segment), and
        - the last line of a fenced code block.

        A fenced code block, including both fence lines and any blank lines
        inside it, is returned as one segment.

        Args:
        ----
            markdown: The markdown content to analyze. CRLF and lone CR
                line endings are converted to LF first.

        Returns:
        -------
            List of (content, start_line, end_line) tuples. ``content`` is
            stripped; lines are 1-indexed and inclusive.

        """
        if not markdown or not markdown.strip():
            return []

        markdown = markdown.replace("\r\n", "\n").replace("\r", "\n")
        code_ranges = self._find_code_block_ranges(markdown)
        # The opening fence line is itself inside its (inclusive) range, so
        # block boundaries are detected from the range endpoints.
        code_starts = {start for start, _ in code_ranges}
        code_ends = {end for _, end in code_ranges}

        segments: list[tuple[str, int, int]] = []
        pending: list[str] = []
        pending_start: int | None = None

        def flush_segment() -> None:
            """Emit the pending lines as a segment and reset the buffer."""
            nonlocal pending, pending_start
            # Trailing blank lines (possible in an unclosed code block) must
            # not count toward end_line.
            while pending and not pending[-1].strip():
                pending.pop()
            if pending and pending_start is not None:
                content = "\n".join(pending).strip()
                if content:
                    end_line = pending_start + len(pending) - 1
                    segments.append((content, pending_start, end_line))
            pending = []
            pending_start = None

        for line_number, line in enumerate(markdown.split("\n"), start=1):
            if self._is_line_in_code_block(line_number, code_ranges):
                # Code block lines are kept verbatim, blank lines included.
                if line_number in code_starts:
                    flush_segment()
                if pending_start is None:
                    pending_start = line_number
                pending.append(line)
                if line_number in code_ends:
                    flush_segment()
                continue

            if not line.strip():
                # Blank line - paragraph boundary
                flush_segment()
                continue

            is_heading = not self._is_indented_code(line) and bool(
                _ATX_HEADING_PATTERN.match(line)
            )
            if is_heading:
                # Headings separate segments and are not content themselves
                flush_segment()
                continue

            # Regular content line
            if pending_start is None:
                pending_start = line_number
            pending.append(line)

        flush_segment()
        return segments

    def parse_content_blocks(self, markdown: str) -> list[ContentBlock]:
        """Parse content blocks with their heading context from markdown.

        Each segment from _extract_content_segments becomes a ContentBlock
        whose heading_path is computed for the segment's first line.
        Headings themselves are not returned as blocks.

        Args:
        ----
            markdown: The markdown content to parse.

        Returns:
        -------
            List of ContentBlock objects in document order. Empty list for
            blank input or input with no content segments.

        Example:
        -------
            >>> parser = HeadingParser()
            >>> md = '''# Title
            ...
            ... First paragraph under title.
            ...
            ... ## Section
            ...
            ... Content under section.
            ... '''
            >>> blocks = parser.parse_content_blocks(md)
            >>> blocks[0].heading_path
            ['H1: Title']
            >>> blocks[1].heading_path
            ['H1: Title', 'H2: Section']

        """
        if not markdown or not markdown.strip():
            return []

        # Headings are parsed once and reused for every segment.
        headings = self.parse_headings(markdown)

        blocks: list[ContentBlock] = []
        for content, start_line, end_line in self._extract_content_segments(
            markdown
        ):
            # Defensive: segments are already stripped and non-empty.
            if not content.strip():
                continue

            heading_path = self.build_heading_path(headings, start_line)
            try:
                block = ContentBlock(
                    content=content,
                    heading_path=heading_path,
                    start_line=start_line,
                    end_line=end_line,
                )
            except ValueError:
                # Deliberate best-effort: a segment that fails model
                # validation is dropped instead of aborting the parse.
                continue
            blocks.append(block)

        return blocks