Spaces:

vROMjs
/

vrom-hub

Sleeping

App Files Files Community

philipp-zettl committed on Apr 25

Commit

cc18cf3

verified ·

1 Parent(s): f7b8b86

Add vrom_hub/chunker.py

Browse files

Files changed (1) hide show

vrom_hub/chunker.py +307 -0

vrom_hub/chunker.py ADDED Viewed

	@@ -0,0 +1,307 @@

+"""
+Section-aware document chunker for vROM.
+Splits markdown documents into ~256-token chunks that:
+- Respect section boundaries (heading-aware)
+- Preserve code blocks intact
+- Create a doubly-linked list (prev_chunk_id / next_chunk_id)
+- Track source file, heading, char offsets, and URL
+"""
+from __future__ import annotations
+import hashlib
+import re
+from dataclasses import dataclass, field, asdict
+from typing import Optional
@dataclass
class Chunk:
    """One piece of a documentation page together with its provenance.

    All fields are required. ``prev_chunk_id`` / ``next_chunk_id`` form a
    doubly-linked list; ``None`` marks the respective end of that list.
    """
    chunk_id: int                 # identifier assigned by the chunker
    text: str                     # the chunk's content
    source_file: str              # file the chunk was taken from
    section_heading: str          # heading of the enclosing section ("" if none)
    char_start: int               # start offset recorded for the chunk
    char_end: int                 # end offset recorded for the chunk
    token_estimate: int           # approximate token count of ``text``
    prev_chunk_id: Optional[int]  # id of the preceding chunk, or None
    next_chunk_id: Optional[int]  # id of the following chunk, or None
    url: str                      # URL associated with the source page
    doc_title: str                # title of the source document

    def to_dict(self) -> dict:
        """Return all fields as a plain ``dict`` (built with ``asdict``)."""
        as_mapping = asdict(self)
        return as_mapping
@dataclass
class DocPage:
    """Input record for the chunker: one documentation page and its identity."""
    # Full text of the page that will be split into chunks.
    content: str
    # Path of the page within its source, e.g. "trl/index.md".
    source_file: str
    # Canonical URL of the page.
    url: str
    # Title of the document.
    title: str
+def _estimate_tokens(text: str) -> int:
+    """Rough token count: ~4 chars per token for English text."""
+    return max(1, len(text) // 4)
+def _split_preserving_code_blocks(text: str) -> list[dict]:
+    """
+    Split text into segments, marking which are code blocks.
+    Returns list of {"text": str, "is_code": bool}.
+    """
+    segments = []
+    pattern = re.compile(r'(```[\s\S]*?```)', re.MULTILINE)
+    last_end = 0
+    for match in pattern.finditer(text):
+        # Text before code block
+        before = text[last_end:match.start()]
+        if before.strip():
+            segments.append({"text": before, "is_code": False})
+        # The code block itself
+        segments.append({"text": match.group(0), "is_code": True})
+        last_end = match.end()
+    # Remaining text after last code block
+    remaining = text[last_end:]
+    if remaining.strip():
+        segments.append({"text": remaining, "is_code": False})
+    return segments
+def _split_into_sections(content: str) -> list[dict]:
+    """
+    Split markdown content by headings.
+    Returns list of {"heading": str, "text": str, "char_start": int, "char_end": int}.
+    """
+    # Match markdown headings (# ## ### etc.)
+    heading_pattern = re.compile(r'^(#{1,6})\s+(.+)$', re.MULTILINE)
+    sections = []
+    matches = list(heading_pattern.finditer(content))
+    if not matches:
+        # No headings found — entire content is one section
+        return [{
+            "heading": "",
+            "text": content,
+            "char_start": 0,
+            "char_end": len(content),
+        }]
+    # Text before first heading
+    if matches[0].start() > 0:
+        pre_text = content[:matches[0].start()]
+        if pre_text.strip():
+            sections.append({
+                "heading": "",
+                "text": pre_text,
+                "char_start": 0,
+                "char_end": matches[0].start(),
+            })
+    for i, match in enumerate(matches):
+        heading_text = match.group(2).strip()
+        start = match.start()
+        end = matches[i + 1].start() if i + 1 < len(matches) else len(content)
+        section_text = content[start:end]
+        sections.append({
+            "heading": heading_text,
+            "text": section_text,
+            "char_start": start,
+            "char_end": end,
+        })
+    return sections
class SectionAwareChunker:
    """
    Chunks documentation pages into ~max_tokens-token pieces.

    Strategy:
    1. Split by markdown headings → sections
    2. For each section, pack paragraphs (and, for over-long paragraphs,
       sentences) into chunks of ≤ max_tokens estimated tokens
    3. Preserve code blocks: a fenced code block always becomes exactly one
       chunk, even if it exceeds max_tokens
    4. Create doubly-linked list pointers between chunks of the same document

    A single sentence that is longer than max_tokens is not split further.
    """

    # Compiled once at class creation instead of on every paragraph.
    _PARAGRAPH_BREAK = re.compile(r'\n\n+')
    _SENTENCE_BREAK = re.compile(r'(?<=[.!?])\s+')

    def __init__(self, max_tokens: int = 256):
        """Store the soft upper bound (in estimated tokens) for a chunk."""
        self.max_tokens = max_tokens

    def chunk_page(self, page: DocPage, start_chunk_id: int = 0) -> list[Chunk]:
        """
        Chunk a single documentation page.

        Args:
            page: The document page to chunk.
            start_chunk_id: The starting chunk_id (for multi-page builds).

        Returns:
            List of Chunk objects with sequential IDs starting at
            start_chunk_id. The first chunk has prev_chunk_id None and the
            last has next_chunk_id None.
        """
        raw_chunks: list[dict] = []
        for section in _split_into_sections(page.content):
            raw_chunks.extend(self._chunk_section(
                text=section["text"],
                heading=section["heading"],
                char_offset=section["char_start"],
                source_file=page.source_file,
                url=page.url,
                doc_title=page.title,
            ))

        # Assign IDs and build linked list
        last_index = len(raw_chunks) - 1
        chunks = []
        for i, raw in enumerate(raw_chunks):
            cid = start_chunk_id + i
            chunks.append(Chunk(
                chunk_id=cid,
                text=raw["text"],
                source_file=raw["source_file"],
                section_heading=raw["heading"],
                char_start=raw["char_start"],
                char_end=raw["char_end"],
                token_estimate=_estimate_tokens(raw["text"]),
                prev_chunk_id=cid - 1 if i > 0 else None,
                next_chunk_id=cid + 1 if i < last_index else None,
                url=raw["url"],
                doc_title=raw["doc_title"],
            ))
        return chunks

    def chunk_pages(self, pages: list[DocPage]) -> list[Chunk]:
        """
        Chunk multiple pages with globally sequential chunk IDs.

        Links never cross a page boundary: chunk_page already terminates each
        page's linked list with None at both ends, so no re-linking is needed
        here.
        """
        all_chunks: list[Chunk] = []
        for page in pages:
            # The next free ID is simply the number of chunks produced so far.
            all_chunks.extend(
                self.chunk_page(page, start_chunk_id=len(all_chunks))
            )
        return all_chunks

    @staticmethod
    def _split_with_offsets(pattern: re.Pattern, text: str) -> list[tuple[str, int]]:
        """Like ``pattern.split(text)``, but each piece comes with its start offset.

        Only valid for patterns without capture groups that never match the
        empty string (true for both class-level patterns).
        """
        pieces = []
        pos = 0
        for sep in pattern.finditer(text):
            pieces.append((text[pos:sep.start()], pos))
            pos = sep.end()
        pieces.append((text[pos:], pos))
        return pieces

    def _chunk_section(
        self,
        text: str,
        heading: str,
        char_offset: int,
        source_file: str,
        url: str,
        doc_title: str,
    ) -> list[dict]:
        """
        Split a section into token-bounded chunks, preserving code blocks.

        Args:
            text: The section's text.
            heading: Heading recorded on every chunk of this section.
            char_offset: Offset of ``text`` within the page content.
            source_file, url, doc_title: Copied onto every chunk.

        Returns:
            List of dicts with keys "text", "heading", "char_start",
            "char_end", "source_file", "url", "doc_title". char_start/char_end
            are page-content offsets of the first and one-past-the-last
            non-whitespace character the chunk was built from. The chunk text
            itself has paragraph/sentence separators normalised to "\\n\\n" /
            " ", so it may differ from the source slice in internal whitespace.
        """
        chunks: list[dict] = []
        buffer = ""                       # text of the chunk being assembled
        span_start: Optional[int] = None  # source offset of its first content
        span_end: Optional[int] = None    # source offset just past its last content

        def emit(chunk_text: str, start: Optional[int], end: Optional[int]) -> None:
            chunks.append({
                "text": chunk_text,
                "heading": heading,
                "char_start": start,
                "char_end": end,
                "source_file": source_file,
                "url": url,
                "doc_title": doc_title,
            })

        def flush() -> None:
            """Emit the buffer (if it has content) and start a fresh one."""
            nonlocal buffer, span_start, span_end
            stripped = buffer.strip()
            if stripped:
                emit(stripped, span_start, span_end)
            buffer = ""
            span_start = span_end = None

        def append(piece: str, piece_start: int, separator: str) -> None:
            """Add a piece to the buffer, flushing first if it would overflow."""
            nonlocal buffer, span_start, span_end
            would_overflow = (
                _estimate_tokens(buffer) + _estimate_tokens(piece) > self.max_tokens
            )
            if would_overflow and buffer.strip():
                flush()
            buffer += piece + separator
            core = piece.strip()
            if core:
                # Track offsets from the source position of the piece, not from
                # the length of the re-joined buffer, so they cannot drift.
                lead = len(piece) - len(piece.lstrip())
                if span_start is None:
                    span_start = piece_start + lead
                span_end = piece_start + lead + len(core)

        cursor = 0  # position in ``text`` just past the previous segment
        for seg in _split_preserving_code_blocks(text):
            seg_text = seg["text"]
            # Segments are in-order slices of ``text``; whitespace-only gaps
            # may have been dropped, so locate each one instead of summing.
            seg_pos = text.find(seg_text, cursor)
            if seg_pos < 0:
                seg_pos = cursor
            cursor = seg_pos + len(seg_text)
            seg_abs = char_offset + seg_pos

            if seg["is_code"]:
                # Code blocks are kept intact even if they exceed max_tokens.
                if buffer.strip():
                    flush()
                emit(seg_text.strip(), seg_abs, seg_abs + len(seg_text))
                continue

            # Regular text — pack paragraphs, falling back to sentences.
            for para, para_pos in self._split_with_offsets(self._PARAGRAPH_BREAK, seg_text):
                para_abs = seg_abs + para_pos
                if _estimate_tokens(para) > self.max_tokens:
                    # Too long to share a chunk with anything already buffered.
                    if buffer.strip():
                        flush()
                    for sent, sent_pos in self._split_with_offsets(self._SENTENCE_BREAK, para):
                        append(sent, para_abs + sent_pos, " ")
                else:
                    append(para, para_abs, "\n\n")

        # Flush remaining
        flush()
        return chunks