# CodeMode / scripts / core / ingestion / ts_chunker.py
# CodeMode Agent
# Deploy CodeMode via Agent
# 463fc7e
"""
Tree-sitter based syntactic chunker - Span enrichment and fallback parser.
This module provides byte-level precise chunking using Tree-sitter, which
serves as a structural fallback and span enrichment layer. Tree-sitter is
language-aware and robust against malformed code, making it ideal for
extracting exact byte spans and as a backup parser.
ARCHITECTURE POSITION:
- Enrichment Layer: Provides byte-level precision
- Fallback Parser: Robust parsing for malformed code
- Span Authority: Source of truth for byte positions
KEY FEATURES:
1. Byte-level accurate spans (exact source positions)
2. Language-aware parsing (supports multiple languages)
3. Robust against syntax errors
4. Extracts structural nodes even from partial code
FLOW:
File → Tree-sitter parser → Structural nodes → Spans for enrichment
USAGE:
    from pathlib import Path
    from ts_chunker import extract_ts_chunks
    chunks = extract_ts_chunks(Path("file.py"))
NOTE: Tree-sitter chunks are NOT primary - they enrich AST chunks with
precise byte spans and serve as fallback for syntax errors.
"""
from pathlib import Path
from typing import List, Optional, Literal, Dict, Tuple
from tree_sitter import Parser, Language, Node
import tree_sitter_python as tspython
from .chunk_schema import CodeChunk, ChunkAST, ChunkSpan, ChunkHierarchy, ChunkType
# ----------------------------
# Types
# ----------------------------
# Lookup table from Tree-sitter node type to the chunk type recorded on the
# emitted CodeChunk. Node types that are absent from this table never produce
# a chunk; the walker only descends through them.
TS_TO_CHUNK_TYPE: Dict[str, ChunkType] = {
    # Whole-file root node
    "module": "module",
    # Class definitions
    "class_definition": "class",
    # Function definitions (sync and async are both reported as "function")
    "function_definition": "function",
    "async_function_definition": "function",
    # Both import forms are reported as "imports"
    "import_statement": "imports",
    "import_from_statement": "imports",
}

# Deepest tree level the walker in extract_ts_chunks visits (the root is
# level 0); nodes nested deeper than this are neither chunked nor descended
# into.
MAX_TS_DEPTH = 3
# ----------------------------
# Helpers
# ----------------------------
def _safe_decode(data: bytes) -> str:
    """
    Decode ``data`` as UTF-8 without ever raising on malformed input.

    Args:
        data: Raw bytes, typically a slice of the source file.

    Returns:
        The decoded text. Bytes that do not form valid UTF-8 are dropped, so
        the result can be shorter than the input for malformed data.
    """
    # errors="ignore" yields exactly the strict result for valid UTF-8 and
    # only differs (by skipping bad bytes) when strict decoding would have
    # failed, so a separate strict attempt is unnecessary.
    return data.decode("utf-8", errors="ignore")
def _get_node_name(node: Node) -> Optional[str]:
    """
    Extract the identifier name for class / function nodes.

    Only the node's direct children are inspected; the first child of type
    ``identifier`` that carries byte text supplies the name.

    Args:
        node: Tree-sitter node to inspect.

    Returns:
        The decoded identifier text, or ``None`` when the node has no direct
        ``identifier`` child with byte text (e.g. module or import nodes).
    """
    for child in node.children:
        if child.type != "identifier":
            continue
        text = child.text
        # An identifier child without byte text is skipped, and the search
        # continues with the remaining children.
        if isinstance(text, (bytes, bytearray)):
            return _safe_decode(text)
    return None
# ----------------------------
# Public API
# ----------------------------
# Wrapper node types that add a tree level without adding logical nesting.
# tree-sitter-python wraps a decorated class/function in a
# ``decorated_definition`` node; charging that wrapper against the depth
# budget pushes decorated methods (and every method of a decorated class)
# past MAX_TS_DEPTH, so they would be silently dropped.
_TS_TRANSPARENT_NODE_TYPES = frozenset({"decorated_definition"})


def _build_ts_chunk(
    node: Node, depth: int, source_bytes: bytes, file_path: Path
) -> CodeChunk:
    """Build the span-enrichment CodeChunk for one chunkable Tree-sitter node."""
    node_type = node.type
    chunk_type = TS_TO_CHUNK_TYPE[node_type]
    code = _safe_decode(source_bytes[node.start_byte : node.end_byte])

    # Imports have no identifier child of their own; use the full import
    # statement text as the name instead.
    name = code.strip() if chunk_type == "imports" else _get_node_name(node)

    return CodeChunk(
        chunk_id=f"ts_{node.start_byte}_{node.end_byte}",
        file_path=str(file_path),
        language="python",
        chunk_type=chunk_type,
        code=code,
        ast=ChunkAST(
            symbol_type=None,  # TS doesn't provide semantic types
            name=name,
            parent=None,  # Parent relationships from AST
            docstring=None,
            decorators=[],
            imports=[],
            node_type=node_type,
        ),
        span=ChunkSpan(
            start_byte=node.start_byte,
            end_byte=node.end_byte,
            # Point rows are shifted by one to produce the stored line numbers.
            start_line=node.start_point[0] + 1,
            end_line=node.end_point[0] + 1,
            # Length of the decoded text, not of the byte slice.
            char_count=len(code),
        ),
        hierarchy=ChunkHierarchy(
            is_primary=False,  # Tree-sitter chunks are for span enrichment only
            is_extracted=True,
            depth=depth,
            parent_id=None,  # Parent relationships from AST
        ),
        metadata={
            "byte_span": {
                "start": node.start_byte,
                "end": node.end_byte,
            },
            "tree_sitter_node_type": node_type,
            "is_exact_span": True,
        },
    )


def extract_ts_chunks(file_path: Path) -> List[CodeChunk]:
    """
    Parse a Python file with Tree-sitter and return one chunk per structural node.

    The file is read as raw bytes and parsed with the Python grammar. The
    tree is walked depth-first in document order, and every node whose type
    is listed in ``TS_TO_CHUNK_TYPE`` is emitted as a ``CodeChunk`` carrying
    its exact byte span. Nodes deeper than ``MAX_TS_DEPTH`` are not visited.
    ``decorated_definition`` wrappers do not count toward that depth, so a
    decorated definition is reported at the same depth as an undecorated one.

    Args:
        file_path: Path of the Python source file to parse.

    Returns:
        Chunks in traversal order. All are marked ``is_primary=False``;
        ``parent`` / ``parent_id`` are left as ``None``.

    Raises:
        OSError: If the file cannot be read.
    """
    source_bytes = file_path.read_bytes()

    language = Language(tspython.language())
    parser = Parser(language=language)
    tree = parser.parse(source_bytes)

    chunks: List[CodeChunk] = []

    def walk(node: Node, depth: int = 0) -> None:
        if depth > MAX_TS_DEPTH:
            return

        node_type = node.type
        if node_type in TS_TO_CHUNK_TYPE:
            # Create chunk with byte-level precision
            chunks.append(_build_ts_chunk(node, depth, source_bytes, file_path))

        # Children of a transparent wrapper stay at the wrapper's own depth.
        if node_type in _TS_TRANSPARENT_NODE_TYPES:
            child_depth = depth
        else:
            child_depth = depth + 1

        for child in node.children:
            walk(child, child_depth)

    walk(tree.root_node)
    return chunks