CodeMode

Sleeping

CodeMode / scripts /core /ingestion /ts_chunker.py

CodeMode Agent

Deploy CodeMode via Agent

463fc7e 4 months ago

5.13 kB

	"""
	Tree-sitter based syntactic chunker - Span enrichment and fallback parser.

	This module provides byte-level precise chunking using Tree-sitter, which
	serves as a structural fallback and span enrichment layer. Tree-sitter is
	language-aware and robust against malformed code, making it ideal for
	extracting exact byte spans and as a backup parser.

	ARCHITECTURE POSITION:
	- Enrichment Layer: Provides byte-level precision
	- Fallback Parser: Robust parsing for malformed code
	- Span Authority: Source of truth for byte positions

	KEY FEATURES:
	1. Byte-level accurate spans (exact source positions)
	2. Language-aware parsing (Tree-sitter supports many languages; this module currently wires up the Python grammar only)
	3. Robust against syntax errors
	4. Extracts structural nodes even from partial code

	FLOW:
	File → Tree-sitter parser → Structural nodes → Spans for enrichment

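	USAGE (import through the enclosing package; this module uses a relative
	import of chunk_schema, so it cannot be loaded as a top-level module):
	from pathlib import Path
	from .ts_chunker import extract_ts_chunks
	chunks = extract_ts_chunks(Path("file.py"))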

	NOTE: Tree-sitter chunks are NOT primary - they enrich AST chunks with
	precise byte spans and serve as fallback for syntax errors.
	"""

	from pathlib import Path
	from typing import List, Optional, Literal, Dict, Tuple

	from tree_sitter import Parser, Language, Node
	import tree_sitter_python as tspython

	from .chunk_schema import CodeChunk, ChunkAST, ChunkSpan, ChunkHierarchy, ChunkType

	# ----------------------------
	# Types
	# ----------------------------

	# Maps a Tree-sitter node type to the chunk type recorded on the emitted chunk.
	# Node types that are not keys of this table produce no chunk.
	# NOTE(review): recent tree-sitter-python grammars appear to represent
	# `async def` as "function_definition" with an `async` child — confirm whether
	# the "async_function_definition" key is ever matched.
	TS_TO_CHUNK_TYPE: Dict[str, ChunkType] = {
	"module": "module",
	"class_definition": "class",
	"function_definition": "function",
	"async_function_definition": "function",
	"import_statement": "imports",
	"import_from_statement": "imports",
	}

	# Depth limit applied by extract_ts_chunks while walking the tree (root = depth 0).
	MAX_TS_DEPTH = 3 # module → imports → class/function → method


	# ----------------------------
	# Helpers
	# ----------------------------

	def _safe_decode(data: bytes) -> str:
	try:
	return data.decode("utf-8")
	except UnicodeDecodeError:
	return data.decode("utf-8", errors="ignore")


	def _get_node_name(node: Node) -> Optional[str]:
	    """
	    Extract identifier name for class / function nodes.

	    Scans the node's direct children in order and decodes the text of the
	    first child of type "identifier" whose text is a bytes object.

	    Args:
	        node: Tree-sitter node to inspect.

	    Returns:
	        The decoded identifier, or None when no direct child qualifies.
	    """
	    for child in node.children:
	        if child.type != "identifier":
	            continue
	        text = child.text
	        # An identifier whose text is not a bytes object is skipped and the
	        # scan continues with the remaining children.
	        if isinstance(text, (bytes, bytearray)):
	            return _safe_decode(text)
	    return None


	# ----------------------------
	# Public API
	# ----------------------------

	def extract_ts_chunks(file_path: Path) -> List[CodeChunk]:
	    """
	    Parse a Python source file with Tree-sitter and return its structural chunks.

	    The file is read as raw bytes, parsed with the tree-sitter-python grammar
	    and walked in pre-order. Every node whose type is a key of
	    TS_TO_CHUNK_TYPE yields one CodeChunk carrying the node's exact byte and
	    line span. Chunks are flagged is_primary=False and carry no parent links
	    (parent=None / parent_id=None).

	    Depth handling:
	        The walk is cut off below MAX_TS_DEPTH levels. "decorated_definition"
	        wrapper nodes are not counted against that limit, so decorated
	        methods and the methods of decorated classes are still extracted
	        (each wrapper pushes them one level deeper in the raw tree than their
	        undecorated equivalents). The depth stored on a chunk is always the
	        raw tree depth (root = 0), which can exceed MAX_TS_DEPTH for nodes
	        reached through such wrappers.

	    Args:
	        file_path: Path of the Python file to chunk.

	    Returns:
	        Chunks in pre-order (source) order. chunk_id has the form
	        "ts_<start_byte>_<end_byte>"; it contains no file component, so ids
	        produced for different files can coincide.

	    Raises:
	        OSError: If the file cannot be read.
	    """
	    source_bytes = file_path.read_bytes()

	    language = Language(tspython.language())
	    parser = Parser(language=language)

	    tree = parser.parse(source_bytes)
	    root = tree.root_node

	    chunks: List[CodeChunk] = []

	    def walk(node: Node, depth: int = 0, wrapper_levels: int = 0) -> None:
	        # The cutoff ignores decorator wrapper levels; `depth` itself stays
	        # the raw tree depth so reported values are unchanged for every node
	        # that was reachable without passing through a wrapper.
	        if depth - wrapper_levels > MAX_TS_DEPTH:
	            return

	        node_type = node.type
	        chunk_type = TS_TO_CHUNK_TYPE.get(node_type)

	        if chunk_type is not None:
	            start_byte = node.start_byte
	            end_byte = node.end_byte
	            code = _safe_decode(source_bytes[start_byte:end_byte])

	            # For imports, use the full import as name
	            if chunk_type == "imports":
	                name = code.strip()
	            else:
	                name = _get_node_name(node)

	            # Create chunk with byte-level precision
	            chunks.append(
	                CodeChunk(
	                    chunk_id=f"ts_{start_byte}_{end_byte}",
	                    file_path=str(file_path),
	                    language="python",
	                    chunk_type=chunk_type,
	                    code=code,
	                    ast=ChunkAST(
	                        symbol_type=None,  # TS doesn't provide semantic types
	                        name=name,
	                        parent=None,  # Parent relationships from AST
	                        docstring=None,
	                        decorators=[],
	                        imports=[],
	                        node_type=node_type,
	                    ),
	                    span=ChunkSpan(
	                        start_byte=start_byte,
	                        end_byte=end_byte,
	                        # Tree-sitter rows are 0-based; stored lines are 1-based.
	                        start_line=node.start_point[0] + 1,
	                        end_line=node.end_point[0] + 1,
	                        char_count=len(code),
	                    ),
	                    hierarchy=ChunkHierarchy(
	                        is_primary=False,  # Tree-sitter chunks are for span enrichment only
	                        is_extracted=True,
	                        depth=depth,
	                        parent_id=None,  # Parent relationships from AST
	                    ),
	                    metadata={
	                        "byte_span": {
	                            "start": start_byte,
	                            "end": end_byte,
	                        },
	                        "tree_sitter_node_type": node_type,
	                        "is_exact_span": True,
	                    },
	                )
	            )

	        # A decorated_definition only wraps the real class/function node, so
	        # its level is not charged against the depth budget of its children.
	        if node_type == "decorated_definition":
	            wrapper_levels += 1

	        for child in node.children:
	            walk(child, depth + 1, wrapper_levels)

	    walk(root)
	    return chunks