Spaces:
Sleeping
Sleeping
| """ | |
| Tree-sitter based syntactic chunker - Span enrichment and fallback parser. | |
| This module provides byte-level precise chunking using Tree-sitter, which | |
| serves as a structural fallback and span enrichment layer. Tree-sitter is | |
| language-aware and robust against malformed code, making it ideal for | |
| extracting exact byte spans and as a backup parser. | |
| ARCHITECTURE POSITION: | |
| - Enrichment Layer: Provides byte-level precision | |
| - Fallback Parser: Robust parsing for malformed code | |
| - Span Authority: Source of truth for byte positions | |
| KEY FEATURES: | |
| 1. Byte-level accurate spans (exact source positions) | |
| 2. Language-aware parsing (supports multiple languages) | |
| 3. Robust against syntax errors | |
| 4. Extracts structural nodes even from partial code | |
| FLOW: | |
| File → Tree-sitter parser → Structural nodes → Spans for enrichment | |
| USAGE: | |
| from ts_chunker import extract_ts_chunks | |
| chunks = extract_ts_chunks(Path("file.py")) | |
| NOTE: Tree-sitter chunks are NOT primary - they enrich AST chunks with | |
| precise byte spans and serve as fallback for syntax errors. | |
| """ | |
| from pathlib import Path | |
| from typing import List, Optional, Literal, Dict, Tuple | |
| from tree_sitter import Parser, Language, Node | |
| import tree_sitter_python as tspython | |
| from .chunk_schema import CodeChunk, ChunkAST, ChunkSpan, ChunkHierarchy, ChunkType | |
| # ---------------------------- | |
| # Types | |
| # ---------------------------- | |
| TS_TO_CHUNK_TYPE: Dict[str, ChunkType] = { | |
| "module": "module", | |
| "class_definition": "class", | |
| "function_definition": "function", | |
| "async_function_definition": "function", | |
| "import_statement": "imports", | |
| "import_from_statement": "imports", | |
| } | |
| MAX_TS_DEPTH = 3 # module → imports → class/function → method | |
| # ---------------------------- | |
| # Helpers | |
| # ---------------------------- | |
| def _safe_decode(data: bytes) -> str: | |
| try: | |
| return data.decode("utf-8") | |
| except UnicodeDecodeError: | |
| return data.decode("utf-8", errors="ignore") | |
| def _get_node_name(node: Node) -> Optional[str]: | |
| """ | |
| Extract identifier name for class / function nodes. | |
| """ | |
| for child in node.children: | |
| if child.type == "identifier": | |
| text = child.text | |
| if isinstance(text, (bytes, bytearray)): | |
| return _safe_decode(text) | |
| return None | |
| # ---------------------------- | |
| # Public API | |
| # ---------------------------- | |
| def extract_ts_chunks(file_path: Path) -> List[CodeChunk]: | |
| source_bytes = file_path.read_bytes() | |
| language = Language(tspython.language()) | |
| parser = Parser(language=language) | |
| tree = parser.parse(source_bytes) | |
| root = tree.root_node | |
| chunks: List[CodeChunk] = [] | |
| def walk(node: Node, depth: int = 0, parent_node: Optional[Node] = None) -> None: | |
| if depth > MAX_TS_DEPTH: | |
| return | |
| node_type = node.type | |
| if node_type in TS_TO_CHUNK_TYPE: | |
| code_bytes = source_bytes[node.start_byte : node.end_byte] | |
| code = _safe_decode(code_bytes) | |
| chunk_type = TS_TO_CHUNK_TYPE[node_type] | |
| name = _get_node_name(node) | |
| # For imports, use the full import as name | |
| if chunk_type == "imports": | |
| name = code.strip() | |
| # Create chunk with byte-level precision | |
| chunks.append( | |
| CodeChunk( | |
| chunk_id=f"ts_{node.start_byte}_{node.end_byte}", | |
| file_path=str(file_path), | |
| language="python", | |
| chunk_type=chunk_type, | |
| code=code, | |
| ast=ChunkAST( | |
| symbol_type=None, # TS doesn't provide semantic types | |
| name=name, | |
| parent=None, # Parent relationships from AST | |
| docstring=None, | |
| decorators=[], | |
| imports=[], | |
| node_type=node_type, | |
| ), | |
| span=ChunkSpan( | |
| start_byte=node.start_byte, | |
| end_byte=node.end_byte, | |
| start_line=node.start_point[0] + 1, | |
| end_line=node.end_point[0] + 1, | |
| char_count=len(code), | |
| ), | |
| hierarchy=ChunkHierarchy( | |
| is_primary=False, # Tree-sitter chunks are for span enrichment only | |
| is_extracted=True, | |
| depth=depth, | |
| parent_id=None, # Parent relationships from AST | |
| ), | |
| metadata={ | |
| "byte_span": { | |
| "start": node.start_byte, | |
| "end": node.end_byte, | |
| }, | |
| "tree_sitter_node_type": node_type, | |
| "is_exact_span": True, | |
| }, | |
| ) | |
| ) | |
| for child in node.children: | |
| walk(child, depth + 1, node) | |
| walk(root) | |
| return chunks |