"""
Code parser (tree-sitter).
Turns a Python source file into a list of *definitions* -- functions, classes,
and methods -- each with the exact source text and 1-indexed start/end lines.
Those line numbers are what make citations exact and un-hallucinatable later.
It handles the real-world cases that trip up naive parsing:
- methods inside classes (named "ClassName.method", type "method")
- decorators (@app.route etc. -> included in the chunk + lines)
- nested functions (found by walking the full syntax tree)
- module-level imports and top-level assignments (collected into one "module" context chunk)
Run standalone to inspect a repo's definitions (a directory or a .zip archive):
    python -m src.ingestion.parser path/to/repo_dir
"""
import tree_sitter_python as tspython
from tree_sitter import Language, Parser
# Build the Python grammar object and a parser bound to it once, at import
# time. extract_definitions() reuses this single module-level parser for
# every file it processes.
_PY_LANGUAGE = Language(tspython.language())
_parser = Parser(_PY_LANGUAGE)
# Node types of top-level statements that are gathered into the per-file
# "module" chunk: both import forms, plus expression statements (which is how
# top-level assignments / constants appear in the tree).
# NOTE(review): "expression_statement" presumably also matches other bare
# top-level expressions such as docstrings and calls -- confirm against the
# grammar if the module chunk should stay limited to imports + config.
_MODULE_CONTEXT_TYPES = {"import_statement", "import_from_statement", "expression_statement"}
def _node_text(src_bytes, node):
    """Return the text of ``node`` as a ``str``.

    ``src_bytes`` is the UTF-8 encoded source; the node's ``start_byte`` and
    ``end_byte`` select the slice to decode. Bytes that cannot be decoded are
    dropped rather than raising.
    """
    begin, end = node.start_byte, node.end_byte
    raw = src_bytes[begin:end]
    return raw.decode("utf-8", errors="ignore")
def _name_of(src_bytes, def_node):
    """Return the identifier stored in ``def_node``'s "name" field.

    Falls back to the placeholder "<anonymous>" when the node has no "name"
    child.
    """
    name_node = def_node.child_by_field_name("name")
    if name_node is None:
        return "<anonymous>"
    return _node_text(src_bytes, name_node)
def _unwrap(node):
    """
    Classify ``node`` as a definition, looking through decorator wrappers.

    Returns ``(def_node_for_naming, outer_node_for_range)`` when ``node`` is a
    function/class definition or a ``decorated_definition`` containing one,
    otherwise ``None``.

    For a decorated definition the two elements differ: the name comes from
    the inner def, while the code/line range comes from the wrapper so that
    the decorators stay part of the chunk.
    """
    definition_types = ("function_definition", "class_definition")

    if node.type in definition_types:
        return node, node
    if node.type != "decorated_definition":
        return None

    # first direct child that is the actual def/class being decorated
    inner = next((c for c in node.children if c.type in definition_types), None)
    if inner is None:
        return None
    return inner, node
def extract_definitions(rel_path, source):
    """Return a list of definition dicts for one file.

    Args:
        rel_path: Path stored in every chunk's "file" field; also used as the
            name of the "module" chunk.
        source: The file's source code as a ``str``.

    Returns:
        A list of dicts with the keys "file", "type", "name", "start_line",
        "end_line" and "code". Function/class/method chunks come first, in
        document order (a definition precedes anything nested inside it),
        followed by at most one "module" chunk. Line numbers are 1-indexed
        and inclusive.
    """
    src_bytes = source.encode("utf-8")
    root = _parser.parse(src_bytes).root_node
    defs = []

    # Pre-order, depth-first walk over the whole tree, driven by an explicit
    # stack instead of recursion. Every node is visited, and a syntax tree can
    # be far deeper than the interpreter's recursion limit (long operator
    # chains, heavily nested literals), which would raise RecursionError in a
    # recursive walk.
    # Each entry is (node, qualified name of the enclosing class or None).
    pending = [(root, None)]
    while pending:
        node, parent_class = pending.pop()
        info = _unwrap(node)
        if info is None:
            # not a definition -> keep descending to find nested defs
            scope_node, scope_class = node, parent_class
        else:
            def_node, outer = info
            is_class = def_node.type == "class_definition"
            if is_class:
                kind = "class"
            elif parent_class:
                kind = "method"
            else:
                kind = "function"
            name = _name_of(src_bytes, def_node)
            full_name = f"{parent_class}.{name}" if parent_class else name
            defs.append({
                "file": rel_path,
                "type": kind,
                "name": full_name,
                # range/code come from the outer node so decorators are included
                "start_line": outer.start_point[0] + 1,
                "end_line": outer.end_point[0] + 1,
                "code": _node_text(src_bytes, outer),
            })
            # Descend into the definition itself (not the decorator wrapper).
            # A class passes its *qualified* name down so that methods of a
            # nested class are named "Outer.Inner.method", matching the
            # "Outer.Inner" name given to the class chunk. A function resets
            # the class context: defs nested in it are plain functions.
            scope_node = def_node
            scope_class = full_name if is_class else None
        # Push children in reverse so they are popped in source order.
        pending.extend((child, scope_class) for child in reversed(scope_node.children))

    # one "module" chunk per file: top-level imports + constants
    context_nodes = [c for c in root.children if c.type in _MODULE_CONTEXT_TYPES]
    if context_nodes:
        defs.append({
            "file": rel_path,
            "type": "module",
            "name": rel_path,
            # NOTE: the range runs from the first to the last context node, so
            # when definitions sit between context statements it also covers
            # lines that are not part of "code" below.
            "start_line": context_nodes[0].start_point[0] + 1,
            "end_line": context_nodes[-1].end_point[0] + 1,
            "code": "\n".join(_node_text(src_bytes, n) for n in context_nodes),
        })
    return defs
def parse_files(files):
    """Extract definitions from every scanned file and return them as one flat list.

    ``files`` is the scanner output: an iterable of records that each provide
    a "rel_path" and a "source" entry. Results keep the order of ``files``.
    """
    return [
        definition
        for record in files
        for definition in extract_definitions(record["rel_path"], record["source"])
    ]
if __name__ == "__main__":
    # CLI entry point: print one line per definition found in a repo given as
    # a directory or a .zip archive.
    import sys

    # Imported inside the guard, so the scanner is only loaded when this
    # module is run as a script.
    from src.ingestion.scanner import scan_python_files, scan_repo

    if len(sys.argv) < 2:
        # Usage errors go to stderr so they never mix with the listing on stdout.
        print("usage: python -m src.ingestion.parser <repo.zip | repo_dir>", file=sys.stderr)
        sys.exit(1)

    target = sys.argv[1]
    if target.lower().endswith(".zip"):
        # the file records are the first element of scan_repo's result
        files = scan_repo(target)[0]
    else:
        files = scan_python_files(target)

    defs = parse_files(files)
    print(f"Extracted {len(defs)} definitions from {len(files)} files:\n")
    for d in defs:
        print(f" [{d['type']:8}] {d['file']}:{d['start_line']}-{d['end_line']} {d['name']}")