| """ |
| Tree-sitter based syntactic chunker - Span enrichment and fallback parser. |
| |
| This module provides byte-level precise chunking using Tree-sitter, which |
| serves as a structural fallback and span enrichment layer. Tree-sitter is |
| language-aware and robust against malformed code, making it ideal for |
| extracting exact byte spans and as a backup parser. |
| |
| ARCHITECTURE POSITION: |
| - Enrichment Layer: Provides byte-level precision |
| - Fallback Parser: Robust parsing for malformed code |
| - Span Authority: Source of truth for byte positions |
| |
| KEY FEATURES: |
| 1. Byte-level accurate spans (exact source positions) |
| 2. Language-aware parsing (supports multiple languages) |
| 3. Robust against syntax errors |
| 4. Extracts structural nodes even from partial code |
| |
| FLOW: |
| File → Tree-sitter parser → Structural nodes → Spans for enrichment |
| |
| USAGE: |
| from ts_chunker import extract_ts_chunks |
| chunks = extract_ts_chunks(Path("file.py")) |
| |
| NOTE: Tree-sitter chunks are NOT primary - they enrich AST chunks with |
| precise byte spans and serve as fallback for syntax errors. |
| """ |
|
|
| from pathlib import Path |
| from typing import List, Optional, Literal, Dict, Tuple |
|
|
| from tree_sitter import Parser, Language, Node |
| import tree_sitter_python as tspython |
|
|
| from .chunk_schema import CodeChunk, ChunkAST, ChunkSpan, ChunkHierarchy, ChunkType |
|
|
| |
| |
| |
|
|
# Lookup table: Tree-sitter node type -> chunk type recorded on the emitted
# CodeChunk. Only nodes whose type is a key here produce a chunk; every other
# node is merely traversed.
TS_TO_CHUNK_TYPE: Dict[str, ChunkType] = {
    # Whole-file node.
    "module": "module",
    # Definitions.
    "class_definition": "class",
    "function_definition": "function",
    # NOTE(review): recent tree-sitter-python grammars appear to report
    # `async def` as a plain `function_definition`; this entry may never
    # match - confirm against the grammar version in use.
    "async_function_definition": "function",
    # Both import forms collapse into the single "imports" chunk type.
    "import_statement": "imports",
    "import_from_statement": "imports",
}
|
|
# Deepest syntax-tree level (root node = depth 0) that the tree walk in
# extract_ts_chunks will visit; nodes below this level are not traversed.
MAX_TS_DEPTH: int = 3
|
|
|
|
| |
| |
| |
|
|
| def _safe_decode(data: bytes) -> str: |
| try: |
| return data.decode("utf-8") |
| except UnicodeDecodeError: |
| return data.decode("utf-8", errors="ignore") |
|
|
|
|
def _get_node_name(node: Node) -> Optional[str]:
    """
    Return the name of a class / function node, if it has one.

    The name is the decoded text of the first direct child of type
    ``identifier`` whose text is available as bytes. Returns None when no
    such child exists.
    """
    # Lazily collect the raw text of each direct `identifier` child.
    identifier_texts = (
        child.text for child in node.children if child.type == "identifier"
    )
    for raw_text in identifier_texts:
        # Skip identifiers whose text is not bytes-like and keep looking.
        if isinstance(raw_text, (bytes, bytearray)):
            return _safe_decode(raw_text)
    return None
|
|
|
|
| |
| |
| |
|
|
def extract_ts_chunks(file_path: Path) -> List[CodeChunk]:
    """
    Parse a Python file with Tree-sitter and return its structural chunks.

    The file is read as raw bytes and parsed with the Python grammar. Each
    visited node whose type is a key of TS_TO_CHUNK_TYPE yields one CodeChunk
    carrying the node's exact byte span, its 1-based line span and the
    decoded source text of that span.

    Depth is counted in syntax-tree levels starting at the root (depth 0);
    nodes deeper than MAX_TS_DEPTH are not visited. A ``decorated_definition``
    wrapper does not consume a level, so a decorated class or function is
    reported at the same depth as its undecorated equivalent instead of being
    pushed past the depth limit and dropped. The chunk for a decorated
    definition still spans the definition node itself, not its decorators.

    Args:
        file_path: Path of the Python source file to chunk.

    Returns:
        Chunks in pre-order (a node before its descendants). Every chunk is
        marked non-primary and carries no parent link.

    Raises:
        OSError: If the file cannot be read.
    """
    source_bytes = file_path.read_bytes()

    language = Language(tspython.language())
    parser = Parser(language=language)

    # Keep the tree referenced for the whole walk.
    tree = parser.parse(source_bytes)
    root = tree.root_node

    chunks: List[CodeChunk] = []

    def walk(node: Node, depth: int = 0) -> None:
        """Emit a chunk for ``node`` if its type is mapped, then recurse."""
        if depth > MAX_TS_DEPTH:
            return

        node_type = node.type
        chunk_type = TS_TO_CHUNK_TYPE.get(node_type)

        if chunk_type is not None:
            start_byte = node.start_byte
            end_byte = node.end_byte
            code = _safe_decode(source_bytes[start_byte:end_byte])

            # Import chunks have no identifier child to name them, so the
            # stripped statement text serves as the name.
            if chunk_type == "imports":
                name = code.strip()
            else:
                name = _get_node_name(node)

            chunks.append(
                CodeChunk(
                    chunk_id=f"ts_{start_byte}_{end_byte}",
                    file_path=str(file_path),
                    language="python",
                    chunk_type=chunk_type,
                    code=code,
                    ast=ChunkAST(
                        symbol_type=None,
                        name=name,
                        parent=None,
                        docstring=None,
                        decorators=[],
                        imports=[],
                        node_type=node_type,
                    ),
                    span=ChunkSpan(
                        start_byte=start_byte,
                        end_byte=end_byte,
                        # Tree-sitter rows are 0-based; chunk lines are 1-based.
                        start_line=node.start_point[0] + 1,
                        end_line=node.end_point[0] + 1,
                        char_count=len(code),
                    ),
                    hierarchy=ChunkHierarchy(
                        is_primary=False,
                        is_extracted=True,
                        depth=depth,
                        parent_id=None,
                    ),
                    metadata={
                        "byte_span": {
                            "start": start_byte,
                            "end": end_byte,
                        },
                        "tree_sitter_node_type": node_type,
                        "is_exact_span": True,
                    },
                )
            )

        # A decorator wrapper is purely syntactic: descending through it must
        # not cost a depth level, otherwise decorated methods (and methods of
        # decorated classes) fall beyond MAX_TS_DEPTH and are never emitted.
        if node_type == "decorated_definition":
            child_depth = depth
        else:
            child_depth = depth + 1

        for child in node.children:
            walk(child, child_depth)

    walk(root)
    return chunks