NLP_Lab / src /chunk.py
apytel
Redesigns UI for FreeCAD RAG Python script generator
11ba2bd
Raw
History Blame Contribute Delete
4.03 kB
"""Markdown-aware, code-block-preserving chunker for FreeCAD wiki pages."""
import re
import uuid
from typing import Any
import tiktoken
from langchain_text_splitters import MarkdownHeaderTextSplitter, RecursiveCharacterTextSplitter
from src.config import CHUNK_SIZE, CHUNK_OVERLAP
# Tokenizer used for every token count in this module (tiktoken "cl100k_base").
_enc = tiktoken.get_encoding("cl100k_base")
# Matches fenced code blocks (``` or ~~~, with optional language tag)
# The match is non-greedy with DOTALL: it runs from an opening fence line,
# across newlines, to the next occurrence of the same fence marker. The
# language tag may only consist of \w characters.
_FENCE_RE = re.compile(r"(```[\w]*\n.*?```|~~~[\w]*\n.*?~~~)", re.DOTALL)
# Markdown header markers paired with the metadata key used for that level.
_HEADERS_TO_SPLIT = [("#", "h1"), ("##", "h2"), ("###", "h3")]
# Fallback splitter for sections that are still too large after the header
# split. It measures length in characters (length_function=len), so the
# token-based settings from src.config are scaled by 4.
_SPLITTER = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", ". ", " ", ""],
    chunk_size=CHUNK_SIZE * 4,  # chars; ~4 chars per token
    chunk_overlap=CHUNK_OVERLAP * 4,
    length_function=len,
)
def _count_tokens(text: str) -> int:
    """Return the number of tokens in *text* under the module tokenizer.

    ``encode_ordinary`` is used instead of ``encode`` so that text which
    happens to contain a special-token literal (for example
    ``<|endoftext|>``) is counted as plain text. ``encode`` with its default
    arguments raises ``ValueError`` on such input, which would abort chunking
    of the whole page.
    """
    return len(_enc.encode_ordinary(text))
def _protect_code_blocks(text: str) -> tuple[str, dict[str, str]]:
    """Swap every fenced code block in *text* for a unique placeholder token.

    Returns a pair ``(protected_text, mapping)`` where ``mapping`` maps each
    placeholder token to the exact code block text it replaced.
    """
    mapping: dict[str, str] = {}
    pieces: list[str] = []
    cursor = 0
    for match in _FENCE_RE.finditer(text):
        token = f"__CODEBLOCK_{uuid.uuid4().hex}__"
        mapping[token] = match.group(0)
        # Keep the text between the previous block and this one, then the token.
        pieces.append(text[cursor:match.start()])
        pieces.append(token)
        cursor = match.end()
    # Trailing text after the last block (or the whole text if none matched).
    pieces.append(text[cursor:])
    return "".join(pieces), mapping
def _restore_code_blocks(text: str, placeholders: dict[str, str]) -> str:
    """Return *text* with each placeholder token replaced by its code block.

    Tokens are substituted one after another in the mapping's iteration
    order; tokens that do not occur in the text are left out of the work.
    """
    restored = text
    for token in placeholders:
        if token in restored:
            restored = restored.replace(token, placeholders[token])
    return restored
# A line that starts with at least four blanks/tabs followed by visible text.
# Uses [ \t] rather than \s so the indentation can never span a newline.
_INDENTED_LINE_RE = re.compile(r"^[ \t]{4,}\S.*$", re.MULTILINE)
# Minimum number of letters outside code for a chunk to count as having prose.
_MIN_PROSE_LETTERS = 20


def _classify(text: str) -> str:
    """Label a chunk as ``"code"``, ``"mixed"`` or ``"text"``.

    A chunk contains code when it has a fenced block or an indented line
    (four or more leading blanks/tabs followed by visible text). It contains
    prose when, after removing the fenced blocks and indented lines, at least
    ``_MIN_PROSE_LETTERS`` letters remain.

    Returns:
        ``"text"`` when no code is found, ``"mixed"`` when code and prose are
        both present, ``"code"`` when only code is present.
    """
    has_fenced = _FENCE_RE.search(text) is not None
    has_indented = _INDENTED_LINE_RE.search(text) is not None
    if not (has_fenced or has_indented):
        return "text"

    # Count letters only in what is left once the code has been removed, so
    # long identifiers inside code do not count as prose, and ordinary
    # sentences (short words separated by spaces) do.
    remainder = _INDENTED_LINE_RE.sub("", _FENCE_RE.sub("", text))
    letter_count = sum(ch.isalpha() for ch in remainder)
    return "mixed" if letter_count >= _MIN_PROSE_LETTERS else "code"
def chunk_page(page: dict[str, Any], *, min_tokens: int = 30) -> list[dict[str, Any]]:
    """Split one wiki page dict into a list of chunk dicts ready for embedding.

    The page text is split on markdown headers (h1-h3) first. A section whose
    token count exceeds ``CHUNK_SIZE`` is then split further by the recursive
    character splitter. Fenced code blocks are replaced by placeholder tokens
    while the splitters run and are put back in the emitted chunk text.

    Args:
        page: Page record. Keys read here: ``raw_text``, ``page_title``,
            ``source_file`` and ``source_url``. (``priority`` is part of the
            page record but is not read by this function.)
        min_tokens: Candidates with fewer tokens than this are dropped. The
            count is taken before the preamble is prepended.

    Returns:
        One dict per kept chunk with the keys ``source_file``, ``source_url``,
        ``page_title``, ``section``, ``type``, ``text``, ``token_len`` and
        ``char_len``. ``text`` starts with a ``[Page: ... | Section: ...]``
        preamble line; the two length fields include that preamble.

    Raises:
        KeyError: If a key listed above is missing from ``page`` when it is
            needed.
    """
    protected, placeholders = _protect_code_blocks(page["raw_text"])

    # Structural split on headers
    header_splitter = MarkdownHeaderTextSplitter(
        headers_to_split_on=_HEADERS_TO_SPLIT, strip_headers=False
    )

    chunks: list[dict[str, Any]] = []
    for sec in header_splitter.split_text(protected):
        content = sec.page_content  # still contains placeholder tokens
        meta = sec.metadata  # {"h1": ..., "h2": ..., "h3": ...}

        # Size is judged on the real text, i.e. with code blocks restored.
        restored = _restore_code_blocks(content, placeholders)
        if _count_tokens(restored) <= CHUNK_SIZE:
            candidates = [restored]
        else:
            # The section text is still protected, so it is split as-is and
            # the pieces are restored with the page-level mapping; there is no
            # need to restore and then protect again with new placeholders.
            candidates = [
                _restore_code_blocks(piece, placeholders)
                for piece in _SPLITTER.split_text(content)
            ]

        # Most specific header available for this section.
        section_label = meta.get("h3") or meta.get("h2") or meta.get("h1") or ""

        # Build preamble for BM25/embedding quality (same for the whole section)
        if section_label:
            preamble = f"[Page: {page['page_title']} | Section: {section_label}]\n"
        else:
            preamble = f"[Page: {page['page_title']}]\n"

        for candidate in candidates:
            text = candidate.strip()
            # Skip empty pieces without tokenizing, then drop tiny fragments.
            if not text or _count_tokens(text) < min_tokens:
                continue
            full_text = preamble + text
            chunks.append({
                "source_file": page["source_file"],
                "source_url": page["source_url"],
                "page_title": page["page_title"],
                "section": section_label,
                "type": _classify(text),  # classified without the preamble
                "text": full_text,
                "token_len": _count_tokens(full_text),
                "char_len": len(full_text),
            })
    return chunks
def chunk_pages(pages: list[dict]) -> list[dict]:
    """Chunk every page and number the resulting chunks.

    Chunks are returned in page order, each carrying a ``chunk_id`` equal to
    its position in the returned list.
    """
    collected: list[dict] = []
    for page in pages:
        for chunk in chunk_page(page):
            # The next free position doubles as the running chunk id.
            chunk["chunk_id"] = len(collected)
            collected.append(chunk)
    return collected