CodeMode

Sleeping

File size: 5,133 Bytes

463fc7e

"""
Tree-sitter based syntactic chunker - Span enrichment and fallback parser.

This module provides byte-level precise chunking using Tree-sitter, which
serves as a structural fallback and span enrichment layer. Tree-sitter is
language-aware and robust against malformed code, making it ideal for
extracting exact byte spans and as a backup parser.

ARCHITECTURE POSITION:
    - Enrichment Layer: Provides byte-level precision
    - Fallback Parser: Robust parsing for malformed code
    - Span Authority: Source of truth for byte positions

KEY FEATURES:
    1. Byte-level accurate spans (exact source positions)
    2. Language-aware parsing (Tree-sitter supports many languages; this
       module currently loads the Python grammar only)
    3. Robust against syntax errors
    4. Extracts structural nodes even from partial code

FLOW:
    File → Tree-sitter parser → Structural nodes → Spans for enrichment

USAGE:
    from ts_chunker import extract_ts_chunks
    chunks = extract_ts_chunks(Path("file.py"))
    
NOTE: Tree-sitter chunks are NOT primary - they enrich AST chunks with
      precise byte spans and serve as fallback for syntax errors.
"""

from pathlib import Path
from typing import List, Optional, Literal, Dict, Tuple

from tree_sitter import Parser, Language, Node
import tree_sitter_python as tspython

from .chunk_schema import CodeChunk, ChunkAST, ChunkSpan, ChunkHierarchy, ChunkType

# ----------------------------
# Types
# ----------------------------

# Tree-sitter node type -> chunk type stored on the emitted CodeChunk.
# extract_ts_chunks emits a chunk only for node types listed here; nodes of
# any other type are still traversed but produce no chunk themselves.
TS_TO_CHUNK_TYPE: Dict[str, ChunkType] = {
    "module": "module",
    "class_definition": "class",
    "function_definition": "function",
    # NOTE(review): presumably never produced by the current tree-sitter-python
    # grammar (async defs appear to parse as function_definition) - confirm.
    "async_function_definition": "function",
    "import_statement": "imports",
    "import_from_statement": "imports",
}

# Deepest syntax-tree level that extract_ts_chunks visits. Every tree level
# counts toward this limit, including nodes that emit no chunk.
MAX_TS_DEPTH = 3  # the root (module) node is depth 0


# ----------------------------
# Helpers
# ----------------------------

def _safe_decode(data: bytes) -> str:
    try:
        return data.decode("utf-8")
    except UnicodeDecodeError:
        return data.decode("utf-8", errors="ignore")


def _get_node_name(node: Node) -> Optional[str]:
    """
    Return the decoded text of the first direct ``identifier`` child of *node*.

    Only direct children are inspected. An identifier child whose ``text`` is
    not a bytes-like value is skipped and the search continues with the next
    child. Returns ``None`` when no usable identifier child exists.
    """
    for candidate in node.children:
        if candidate.type != "identifier":
            continue
        raw_text = candidate.text
        if not isinstance(raw_text, (bytes, bytearray)):
            continue
        return _safe_decode(raw_text)
    return None


# ----------------------------
# Public API
# ----------------------------

def extract_ts_chunks(file_path: Path) -> List[CodeChunk]:
    """
    Parse a Python file with Tree-sitter and return one chunk per mapped node.

    The file is read as raw bytes and parsed with the Python grammar. The
    syntax tree is walked in pre-order (a node before its children, children
    in source order) from the root down to ``MAX_TS_DEPTH``. Every visited
    node whose type is a key of ``TS_TO_CHUNK_TYPE`` becomes a ``CodeChunk``.

    Args:
        file_path: Path of the source file to read and parse.

    Returns:
        The chunks in pre-order. Each one is marked ``is_primary=False`` and
        carries no parent link (``ast.parent`` and ``hierarchy.parent_id``
        are ``None``).

    Raises:
        OSError: If the file cannot be read.

    NOTE(review): depth counts every syntax-tree level, including nodes that
    emit no chunk, so definitions wrapped in extra nodes (presumably e.g.
    decorated methods) may sit below the cutoff and be skipped - confirm
    that this is intended.
    """
    source_bytes = file_path.read_bytes()

    parser = Parser(language=Language(tspython.language()))
    root = parser.parse(source_bytes).root_node

    def _to_chunk(node: Node, level: int) -> CodeChunk:
        """Build the chunk record for a single mapped syntax node."""
        ts_type = node.type
        first_byte, last_byte = node.start_byte, node.end_byte
        text = _safe_decode(source_bytes[first_byte:last_byte])
        kind = TS_TO_CHUNK_TYPE[ts_type]

        # Imports are labelled with the whole statement; every other node
        # with its first identifier child (None when it has none).
        label = text.strip() if kind == "imports" else _get_node_name(node)

        ast_info = ChunkAST(
            symbol_type=None,  # not derived from the Tree-sitter node
            name=label,
            parent=None,  # parent links are left unset here
            docstring=None,
            decorators=[],
            imports=[],
            node_type=ts_type,
        )
        span_info = ChunkSpan(
            start_byte=first_byte,
            end_byte=last_byte,
            # Tree-sitter rows are zero-based; shift to 1-based line numbers.
            start_line=node.start_point[0] + 1,
            end_line=node.end_point[0] + 1,
            char_count=len(text),
        )
        hierarchy_info = ChunkHierarchy(
            is_primary=False,  # these chunks only enrich spans
            is_extracted=True,
            depth=level,
            parent_id=None,  # parent links are left unset here
        )
        return CodeChunk(
            chunk_id=f"ts_{first_byte}_{last_byte}",
            file_path=str(file_path),
            language="python",
            chunk_type=kind,
            code=text,
            ast=ast_info,
            span=span_info,
            hierarchy=hierarchy_info,
            metadata={
                "byte_span": {"start": first_byte, "end": last_byte},
                "tree_sitter_node_type": ts_type,
                "is_exact_span": True,
            },
        )

    chunks: List[CodeChunk] = []

    # Explicit stack instead of recursion; each entry is (node, depth).
    pending: List[Tuple[Node, int]] = [(root, 0)]
    while pending:
        current, level = pending.pop()
        if level > MAX_TS_DEPTH:
            continue

        if current.type in TS_TO_CHUNK_TYPE:
            chunks.append(_to_chunk(current, level))

        # Push children reversed so they are popped in source order,
        # which keeps the overall traversal pre-order.
        pending.extend((child, level + 1) for child in reversed(current.children))

    return chunks