File size: 4,679 Bytes
8e72e1f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
"""
Code parser (tree-sitter).

Turns a Python source file into a list of *definitions* -- functions, classes,
and methods -- each with the exact source text and 1-indexed start/end lines.
Those line numbers are what make citations exact and un-hallucinatable later.

It handles the real-world cases that trip up naive parsing:
  - methods inside classes  (named "ClassName.method", type "method")
  - decorators              (@app.route etc. -> included in the chunk + lines)
  - nested functions        (found via full-tree recursion)
  - module-level imports     (collected into one "module" context chunk)

Run standalone to inspect a repo's definitions (a directory or a .zip archive):
    python -m src.ingestion.parser path/to/repo_dir
    python -m src.ingestion.parser path/to/repo.zip
"""
import tree_sitter_python as tspython
from tree_sitter import Language, Parser

# Python grammar wrapped for tree-sitter, and the parser built on it. Both are
# created once at import time; extract_definitions() reuses this single parser
# for every file it is given.
_PY_LANGUAGE = Language(tspython.language())
_parser = Parser(_PY_LANGUAGE)

# Top-level nodes we fold into a per-file "module" chunk (imports + config).
# extract_definitions() selects direct children of the root purely by node
# type, so anything the grammar reports under these types is included.
_MODULE_CONTEXT_TYPES = {
    "import_statement",
    "import_from_statement",
    # top-level assignments / constants
    # NOTE(review): this node type presumably also covers module docstrings
    # and bare top-level calls -- confirm against the grammar if that matters.
    "expression_statement",
}


def _node_text(src_bytes, node):
    return src_bytes[node.start_byte:node.end_byte].decode("utf-8", errors="ignore")


def _name_of(src_bytes, def_node):
    n = def_node.child_by_field_name("name")
    return _node_text(src_bytes, n) if n is not None else "<anonymous>"


def _unwrap(node):
    """
    If node is (or wraps) a function/class definition, return
    (def_node_for_naming, outer_node_for_range); else None.

    decorated_definition wraps the real def with its decorators, so we name
    from the inner def but take the code/line range from the outer node so the
    decorators are part of the chunk.
    """
    if node.type in ("function_definition", "class_definition"):
        return node, node
    if node.type == "decorated_definition":
        for child in node.children:
            if child.type in ("function_definition", "class_definition"):
                return child, node
    return None


def extract_definitions(rel_path, source):
    """Return a list of definition dicts for one file.

    Args:
        rel_path: Path stored in every dict's "file" field; it is also used as
            the "name" of the per-file module chunk.
        source: The file's source text (``str``). It is encoded as UTF-8
            before parsing, so node offsets are byte offsets into that
            encoding.

    Returns:
        A list of dicts with the keys "file", "type", "name", "start_line",
        "end_line" and "code". Function/class/method entries come first, in
        document (pre-order) order, with 1-indexed inclusive line numbers.
        A function found while inside a class scope is typed "method" and
        named "Class.method"; nested classes are fully qualified, so a method
        of ``Outer.Inner`` is named "Outer.Inner.method". If the file has any
        top-level import/expression statements, one trailing "module" entry
        collects their text.

    Note:
        The module entry's line range runs from the first to the last
        collected statement, while its "code" holds only those statements
        joined by newlines -- lines of other definitions that sit in between
        are inside the range but not in "code".
    """
    src_bytes = source.encode("utf-8")
    root = _parser.parse(src_bytes).root_node
    defs = []

    def emit(def_node, outer, parent_class):
        """Append one definition dict; return the class scope for its body."""
        is_class = def_node.type == "class_definition"
        kind = "class" if is_class else ("method" if parent_class else "function")
        name = _name_of(src_bytes, def_node)
        full_name = f"{parent_class}.{name}" if parent_class else name
        defs.append({
            "file": rel_path,
            "type": kind,
            "name": full_name,
            # tree-sitter rows are 0-indexed; citations use 1-indexed lines
            "start_line": outer.start_point[0] + 1,
            "end_line": outer.end_point[0] + 1,
            "code": _node_text(src_bytes, outer),
        })
        # Hand the *qualified* class name down so members of a nested class
        # are named consistently with the class entry itself. A function
        # body resets the scope: defs nested in a function are not methods.
        return full_name if is_class else None

    # Pre-order traversal with an explicit stack instead of recursion, so a
    # very deep syntax tree cannot exhaust the interpreter's recursion limit.
    # Each entry pairs a node with the class scope that applies to it.
    pending = [(root, None)]
    while pending:
        node, parent_class = pending.pop()
        info = _unwrap(node)
        if info:
            def_node, outer = info
            # descend into the def itself: a class's inner funcs become methods
            scope = emit(def_node, outer, parent_class)
            children = def_node.children
        else:
            # not a definition -> keep descending to find nested defs
            scope = parent_class
            children = node.children
        # Push in reverse so children are popped (visited) in source order.
        pending.extend((child, scope) for child in reversed(children))

    # one "module" chunk per file: top-level imports + constants
    context_nodes = [c for c in root.children if c.type in _MODULE_CONTEXT_TYPES]
    if context_nodes:
        defs.append({
            "file": rel_path,
            "type": "module",
            "name": rel_path,
            "start_line": context_nodes[0].start_point[0] + 1,
            "end_line": context_nodes[-1].end_point[0] + 1,
            "code": "\n".join(_node_text(src_bytes, n) for n in context_nodes),
        })

    return defs


def parse_files(files):
    """Extract definitions from every file record and return them as one list.

    Each record in ``files`` (scanner output) is read through its "rel_path"
    and "source" keys and passed to extract_definitions; the per-file results
    are concatenated in input order.
    """
    return [
        definition
        for record in files
        for definition in extract_definitions(record["rel_path"], record["source"])
    ]


if __name__ == "__main__":
    # CLI entry point: scan the given repo, extract its definitions, and print
    # one summary line per definition.
    import sys
    from src.ingestion.scanner import scan_python_files, scan_repo

    cli_args = sys.argv[1:]
    if not cli_args:
        print("usage: python -m src.ingestion.parser <repo.zip | repo_dir>")
        sys.exit(1)

    target = cli_args[0]
    if target.lower().endswith(".zip"):
        # scan_repo's first return element is used as the file list
        # (NOTE(review): remaining elements are ignored here -- confirm intent).
        files = scan_repo(target)[0]
    else:
        files = scan_python_files(target)

    defs = parse_files(files)
    print(f"Extracted {len(defs)} definitions from {len(files)} files:\n")
    for d in defs:
        print(f"  [{d['type']:8}] {d['file']}:{d['start_line']}-{d['end_line']}  {d['name']}")