"""
Tree-sitter based syntactic chunker - Span enrichment and fallback parser.
This module provides byte-level precise chunking using Tree-sitter, which
serves as a structural fallback and span enrichment layer. Tree-sitter is
language-aware and robust against malformed code, making it ideal for
extracting exact byte spans and as a backup parser.
ARCHITECTURE POSITION:
- Enrichment Layer: Provides byte-level precision
- Fallback Parser: Robust parsing for malformed code
- Span Authority: Source of truth for byte positions
KEY FEATURES:
1. Byte-level accurate spans (exact source positions)
2. Language-aware parsing (supports multiple languages)
3. Robust against syntax errors
4. Extracts structural nodes even from partial code
FLOW:
File → Tree-sitter parser → Structural nodes → Spans for enrichment
USAGE:
from ts_chunker import extract_ts_chunks
chunks = extract_ts_chunks(Path("file.py"))
NOTE: Tree-sitter chunks are NOT primary - they enrich AST chunks with
precise byte spans and serve as fallback for syntax errors.
"""
from pathlib import Path
from typing import List, Optional, Literal, Dict, Tuple
from tree_sitter import Parser, Language, Node
import tree_sitter_python as tspython
from .chunk_schema import CodeChunk, ChunkAST, ChunkSpan, ChunkHierarchy, ChunkType
# ----------------------------
# Types
# ----------------------------
# Maps a Tree-sitter node type name to the ChunkType label stored on the
# emitted chunk. extract_ts_chunks only emits chunks for node types listed
# here; nodes of any other type are merely walked through.
TS_TO_CHUNK_TYPE: Dict[str, ChunkType] = {
    "module": "module",
    "class_definition": "class",
    "function_definition": "function",
    # NOTE(review): current tree-sitter-python grammars appear to represent
    # `async def` as a function_definition with an `async` keyword child, so
    # this key may never match - confirm against the grammar version in use.
    "async_function_definition": "function",
    "import_statement": "imports",
    "import_from_statement": "imports",
}
# Depth cut-off for the walker in extract_ts_chunks: the root starts at 0 and a
# node (with its whole subtree) is skipped once its depth counter exceeds this
# value. The counter advances while descending through Tree-sitter nodes, so
# intermediate non-chunk nodes (e.g. the body of a class or function) consume
# levels too - the arrow chain below is the intended nesting, not a literal
# node path.
MAX_TS_DEPTH = 3 # module → imports → class/function → method
# ----------------------------
# Helpers
# ----------------------------
def _safe_decode(data: bytes) -> str:
try:
return data.decode("utf-8")
except UnicodeDecodeError:
return data.decode("utf-8", errors="ignore")
def _get_node_name(node: Node) -> Optional[str]:
    """
    Return the decoded text of the first usable ``identifier`` child.

    Only the direct children of ``node`` are inspected, in order. A child
    counts when its type is ``"identifier"`` and its ``text`` is a bytes-like
    value; the first such child wins. ``None`` is returned when no child
    qualifies.
    """
    for candidate in node.children:
        if candidate.type != "identifier":
            continue
        raw_name = candidate.text
        # An identifier without byte text is skipped so that a later
        # identifier child can still supply the name.
        if not isinstance(raw_name, (bytes, bytearray)):
            continue
        return _safe_decode(raw_name)
    return None
# ----------------------------
# Public API
# ----------------------------
def extract_ts_chunks(file_path: Path) -> List[CodeChunk]:
    """
    Parse a Python file with Tree-sitter and emit one chunk per structural node.

    Every visited node whose type is a key of ``TS_TO_CHUNK_TYPE`` becomes a
    ``CodeChunk`` carrying the node's exact byte span, its 1-based line range
    and the decoded source slice. Chunks are appended in pre-order (a node
    before its descendants, siblings in source order).

    Args:
        file_path: File to read. Its raw bytes are handed to the parser, so
            byte offsets in the result refer to the file content as stored.

    Returns:
        The extracted chunks. All of them are flagged ``is_primary=False`` and
        leave ``parent`` / ``parent_id`` as ``None``.

    Raises:
        OSError: Propagated from ``file_path.read_bytes()`` when the file
            cannot be read.
    """
    source_bytes = file_path.read_bytes()
    language = Language(tspython.language())
    parser = Parser(language=language)
    tree = parser.parse(source_bytes)
    root = tree.root_node
    chunks: List[CodeChunk] = []

    def walk(node: Node, depth: int = 0, budget_depth: int = 0) -> None:
        # `depth` is the raw tree depth reported on the chunk; `budget_depth`
        # is the counter checked against MAX_TS_DEPTH. They differ only below
        # decorator wrappers (see the end of this function).
        if budget_depth > MAX_TS_DEPTH:
            return

        node_type = node.type
        if node_type in TS_TO_CHUNK_TYPE:
            code_bytes = source_bytes[node.start_byte : node.end_byte]
            code = _safe_decode(code_bytes)
            chunk_type = TS_TO_CHUNK_TYPE[node_type]
            name = _get_node_name(node)

            # For imports, use the full import statement as the name.
            if chunk_type == "imports":
                name = code.strip()

            # Create chunk with byte-level precision.
            chunks.append(
                CodeChunk(
                    chunk_id=f"ts_{node.start_byte}_{node.end_byte}",
                    file_path=str(file_path),
                    language="python",
                    chunk_type=chunk_type,
                    code=code,
                    ast=ChunkAST(
                        symbol_type=None,  # TS doesn't provide semantic types
                        name=name,
                        parent=None,  # Parent relationships come from AST
                        docstring=None,
                        decorators=[],
                        imports=[],
                        node_type=node_type,
                    ),
                    span=ChunkSpan(
                        start_byte=node.start_byte,
                        end_byte=node.end_byte,
                        # Tree-sitter rows are 0-based; chunk lines are 1-based.
                        start_line=node.start_point[0] + 1,
                        end_line=node.end_point[0] + 1,
                        char_count=len(code),
                    ),
                    hierarchy=ChunkHierarchy(
                        is_primary=False,  # Tree-sitter chunks are for span enrichment only
                        is_extracted=True,
                        depth=depth,
                        parent_id=None,  # Parent relationships come from AST
                    ),
                    metadata={
                        "byte_span": {
                            "start": node.start_byte,
                            "end": node.end_byte,
                        },
                        "tree_sitter_node_type": node_type,
                        "is_exact_span": True,
                    },
                )
            )

        # A decorated_definition merely wraps the class/function it decorates.
        # Charging it against the cut-off pushed decorated methods (module ->
        # class_definition -> block -> decorated_definition ->
        # function_definition) one level past MAX_TS_DEPTH, so they were
        # silently dropped. The wrapper therefore does not consume a level;
        # the reported `depth` still counts every tree node.
        child_budget = (
            budget_depth if node_type == "decorated_definition" else budget_depth + 1
        )
        for child in node.children:
            walk(child, depth + 1, child_budget)

    walk(root)
    return chunks