File size: 5,133 Bytes
463fc7e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
"""
Tree-sitter based syntactic chunker - Span enrichment and fallback parser.

This module provides byte-level precise chunking using Tree-sitter, which
serves as a structural fallback and span enrichment layer. Tree-sitter is
language-aware and robust against malformed code, making it ideal for
extracting exact byte spans and as a backup parser.

ARCHITECTURE POSITION:
    - Enrichment Layer: Provides byte-level precision
    - Fallback Parser: Robust parsing for malformed code
    - Span Authority: Source of truth for byte positions

KEY FEATURES:
    1. Byte-level accurate spans (exact source positions)
    2. Language-aware parsing (supports multiple languages)
    3. Robust against syntax errors
    4. Extracts structural nodes even from partial code

FLOW:
    File → Tree-sitter parser → Structural nodes → Spans for enrichment

USAGE:
    from ts_chunker import extract_ts_chunks
    chunks = extract_ts_chunks(Path("file.py"))
    
NOTE: Tree-sitter chunks are NOT primary - they enrich AST chunks with
      precise byte spans and serve as fallback for syntax errors.
"""

from pathlib import Path
from typing import List, Optional, Literal, Dict, Tuple

from tree_sitter import Parser, Language, Node
import tree_sitter_python as tspython

from .chunk_schema import CodeChunk, ChunkAST, ChunkSpan, ChunkHierarchy, ChunkType

# ----------------------------
# Types
# ----------------------------

# Maps Tree-sitter node type names to the chunk type label stored on the
# emitted CodeChunk. Only node types listed here produce a chunk; every other
# node is merely traversed by extract_ts_chunks.
# NOTE(review): confirm that the installed tree-sitter-python grammar still
# emits "async_function_definition"; if async functions are reported as
# "function_definition", that entry is never matched (harmless, but dead).
TS_TO_CHUNK_TYPE: Dict[str, ChunkType] = {
    "module": "module",
    "class_definition": "class",
    "function_definition": "function",
    "async_function_definition": "function",
    "import_statement": "imports",
    "import_from_statement": "imports",
}

# Deepest level the walker in extract_ts_chunks will visit. The root node is
# level 0; a node whose level exceeds this value is skipped together with its
# whole subtree.
# NOTE(review): the previous inline comment described the levels as
# "module → imports → class/function → method" — confirm that this matches
# the levels the grammar actually produces (imports are not parents of
# classes, and intermediate syntax nodes may also consume a level).
MAX_TS_DEPTH = 3


# ----------------------------
# Helpers
# ----------------------------

def _safe_decode(data: bytes) -> str:
    """
    Decode *data* as UTF-8 without ever raising on malformed input.

    Well-formed UTF-8 is returned unchanged; byte sequences that are not
    valid UTF-8 are silently dropped from the result.
    """
    # For valid input "ignore" yields exactly what a strict decode would,
    # so a single lenient pass covers both the normal and the fallback case.
    return data.decode("utf-8", errors="ignore")


def _get_node_name(node: Node) -> Optional[str]:
    """
    Return the decoded text of the first usable ``identifier`` child of *node*.

    Direct children are scanned in order. The first child of type
    ``identifier`` whose ``text`` is a bytes-like value wins and is decoded
    with ``_safe_decode``. ``None`` is returned when no such child exists.
    """
    # Lazily pull the raw text of identifier children only; other children
    # never have their ``text`` attribute touched.
    identifier_texts = (
        child.text for child in node.children if child.type == "identifier"
    )
    raw_name = next(
        (text for text in identifier_texts if isinstance(text, (bytes, bytearray))),
        None,
    )
    if raw_name is None:
        return None
    return _safe_decode(raw_name)


# ----------------------------
# Public API
# ----------------------------

def extract_ts_chunks(file_path: Path) -> List[CodeChunk]:
    """
    Parse a Python file with Tree-sitter and emit one chunk per structural node.

    The file is read as raw bytes and parsed with the tree-sitter-python
    grammar. The syntax tree is walked from the root, and every node whose
    type appears in ``TS_TO_CHUNK_TYPE`` becomes a ``CodeChunk`` carrying the
    node's exact byte span and 1-based line span.

    Chunks produced here are flagged ``is_primary=False`` and carry no parent
    information (``ast.parent`` and ``hierarchy.parent_id`` are ``None``).

    Args:
        file_path: Path of the Python source file to parse.

    Returns:
        The chunks in pre-order (a node is emitted before its descendants).
        ``hierarchy.depth`` is the walker depth of the node, with the root at
        0. ``decorated_definition`` wrappers do not count as a level, so a
        decorated definition reports the same depth as an undecorated one.

    Raises:
        OSError: If ``file_path`` cannot be read.
    """
    source_bytes = file_path.read_bytes()

    language = Language(tspython.language())
    parser = Parser(language=language)

    tree = parser.parse(source_bytes)
    root = tree.root_node

    chunks: List[CodeChunk] = []

    def walk(node: Node, depth: int = 0) -> None:
        # Anything past the configured depth is skipped with its subtree.
        if depth > MAX_TS_DEPTH:
            return

        node_type = node.type

        if node_type in TS_TO_CHUNK_TYPE:
            # Slice the original bytes so the text matches the span exactly.
            code_bytes = source_bytes[node.start_byte : node.end_byte]
            code = _safe_decode(code_bytes)

            chunk_type = TS_TO_CHUNK_TYPE[node_type]
            name = _get_node_name(node)

            # For imports, use the full import as name
            if chunk_type == "imports":
                name = code.strip()

            # Create chunk with byte-level precision
            chunks.append(
                CodeChunk(
                    chunk_id=f"ts_{node.start_byte}_{node.end_byte}",
                    file_path=str(file_path),
                    language="python",
                    chunk_type=chunk_type,
                    code=code,
                    ast=ChunkAST(
                        symbol_type=None,  # TS doesn't provide semantic types
                        name=name,
                        parent=None,  # Parent relationships from AST
                        docstring=None,
                        decorators=[],
                        imports=[],
                        node_type=node_type,
                    ),
                    span=ChunkSpan(
                        start_byte=node.start_byte,
                        end_byte=node.end_byte,
                        # Tree-sitter rows are 0-based; lines are reported 1-based.
                        start_line=node.start_point[0] + 1,
                        end_line=node.end_point[0] + 1,
                        # Length of the decoded text, i.e. characters, not bytes.
                        char_count=len(code),
                    ),
                    hierarchy=ChunkHierarchy(
                        is_primary=False,  # Tree-sitter chunks are for span enrichment only
                        is_extracted=True,
                        depth=depth,
                        parent_id=None,  # Parent relationships from AST
                    ),
                    metadata={
                        "byte_span": {
                            "start": node.start_byte,
                            "end": node.end_byte,
                        },
                        "tree_sitter_node_type": node_type,
                        "is_exact_span": True,
                    },
                )
            )

        # In the tree-sitter-python grammar a decorated class or function is
        # wrapped in a ``decorated_definition`` node. Counting that wrapper as
        # a level pushed decorated methods (and methods of decorated classes)
        # past MAX_TS_DEPTH, so they were silently dropped. Treat the wrapper
        # as transparent so decorated and undecorated definitions sit at the
        # same depth.
        child_depth = depth if node_type == "decorated_definition" else depth + 1

        for child in node.children:
            walk(child, child_depth)

    walk(root)
    return chunks