AishaSurve's picture
Codebase Intelligence Agent: code-aware RAG + test-gen agent + eval
8e72e1f
Raw
History Blame Contribute Delete
4.68 kB
"""
Code parser (tree-sitter).
Turns a Python source file into a list of *definitions* -- functions, classes,
and methods -- each with the exact source text and 1-indexed start/end lines.
Those line numbers are what make citations exact and un-hallucinatable later.
It handles the real-world cases that trip up naive parsing:
- methods inside classes (named "ClassName.method", type "method")
- decorators (@app.route etc. -> included in the chunk + lines)
- nested functions (found via full-tree recursion)
- module-level imports (collected into one "module" context chunk)
Run standalone to inspect a repo's definitions (a directory or a .zip archive):
python -m src.ingestion.parser path/to/repo_dir
"""
import tree_sitter_python as tspython
from tree_sitter import Language, Parser
# The grammar handle and the parser are built once at import time;
# extract_definitions reuses the shared _parser for every file it parses.
_PY_LANGUAGE = Language(tspython.language())
_parser = Parser(_PY_LANGUAGE)
# Top-level node types we fold into a per-file "module" chunk (imports + config).
# Only direct children of the root node are checked against this set.
# NOTE(review): expression_statement presumably also matches module docstrings
# and bare top-level calls, not only assignments -- confirm that is intended.
_MODULE_CONTEXT_TYPES = {
"import_statement",
"import_from_statement",
"expression_statement", # top-level assignments / constants
}
def _node_text(src_bytes, node):
return src_bytes[node.start_byte:node.end_byte].decode("utf-8", errors="ignore")
def _name_of(src_bytes, def_node):
n = def_node.child_by_field_name("name")
return _node_text(src_bytes, n) if n is not None else "<anonymous>"
def _unwrap(node):
"""
If node is (or wraps) a function/class definition, return
(def_node_for_naming, outer_node_for_range); else None.
decorated_definition wraps the real def with its decorators, so we name
from the inner def but take the code/line range from the outer node so the
decorators are part of the chunk.
"""
if node.type in ("function_definition", "class_definition"):
return node, node
if node.type == "decorated_definition":
for child in node.children:
if child.type in ("function_definition", "class_definition"):
return child, node
return None
def extract_definitions(rel_path, source):
    """Return a list of definition dicts for one file.

    Args:
        rel_path: Path stored in every chunk's "file" field; also used as the
            name of the per-file "module" chunk.
        source: The file's Python source as a str.

    Returns:
        A list of dicts with the keys "file", "type", "name", "start_line",
        "end_line" and "code". Definitions are listed in source (pre-order)
        order. "type" is "class", "method" (a def whose nearest enclosing
        definition is a class) or "function". Names of definitions inside a
        class are qualified with the full class path, e.g. "Outer.Inner.run".
        Line numbers are 1-indexed and, like "code", include any decorators.
        If the file has top-level import / expression statements, one
        trailing "module" chunk collects them.
    """
    src_bytes = source.encode("utf-8")
    root = _parser.parse(src_bytes).root_node
    defs = []

    def emit(def_node, outer, parent_class):
        """Record one definition; return the class path for its children.

        Returns the qualified class name when def_node is a class (so nested
        members are named beneath it) and None for functions, whose inner
        defs are plain functions rather than methods.
        """
        is_class = def_node.type == "class_definition"
        kind = "class" if is_class else ("method" if parent_class else "function")
        name = _name_of(src_bytes, def_node)
        full_name = f"{parent_class}.{name}" if parent_class else name
        defs.append({
            "file": rel_path,
            "type": kind,
            "name": full_name,
            # tree-sitter rows are 0-indexed; chunks carry 1-indexed lines.
            "start_line": outer.start_point[0] + 1,
            "end_line": outer.end_point[0] + 1,
            "code": _node_text(src_bytes, outer),
        })
        # Hand down the *qualified* name so members of a nested class are
        # named "Outer.Inner.method" instead of losing the outer class.
        return full_name if is_class else None

    # Pre-order traversal with an explicit stack rather than recursion: the
    # walk visits every node (expressions included), so a deeply nested tree
    # would otherwise exhaust Python's recursion limit.
    pending = [(root, None)]
    while pending:
        node, parent_class = pending.pop()
        info = _unwrap(node)
        if info:
            def_node, outer = info
            # descend into the def itself (not the decorator wrapper): a
            # class's inner funcs become methods
            child_parent = emit(def_node, outer, parent_class)
            descend_from = def_node
        else:
            # not a definition -> keep descending to find nested defs
            child_parent = parent_class
            descend_from = node
        # Reversed so the first child is popped (and emitted) first.
        pending.extend(
            (child, child_parent) for child in reversed(descend_from.children)
        )

    # one "module" chunk per file: top-level imports + constants
    context_nodes = [c for c in root.children if c.type in _MODULE_CONTEXT_TYPES]
    if context_nodes:
        defs.append({
            "file": rel_path,
            "type": "module",
            "name": rel_path,
            # The range runs from the first to the last context node, so it
            # can span definitions in between whose text is not in "code".
            "start_line": context_nodes[0].start_point[0] + 1,
            "end_line": context_nodes[-1].end_point[0] + 1,
            "code": "\n".join(_node_text(src_bytes, n) for n in context_nodes),
        })
    return defs
def parse_files(files):
    """Extract definitions from every scanned file record.

    Each record in *files* is read through its "rel_path" and "source" keys;
    the per-file results of extract_definitions are concatenated, in input
    order, into a single flat list.
    """
    return [
        definition
        for record in files
        for definition in extract_definitions(record["rel_path"], record["source"])
    ]
if __name__ == "__main__":
    # CLI entry point: list every definition found in a repo dir or .zip.
    import sys

    from src.ingestion.scanner import scan_python_files, scan_repo

    if len(sys.argv) < 2:
        print("usage: python -m src.ingestion.parser <repo.zip | repo_dir>")
        sys.exit(1)

    target = sys.argv[1]
    # Targets ending in ".zip" go through scan_repo (first element of its
    # result is the file list); anything else is scanned as a directory.
    if target.lower().endswith(".zip"):
        files = scan_repo(target)[0]
    else:
        files = scan_python_files(target)

    defs = parse_files(files)
    print(f"Extracted {len(defs)} definitions from {len(files)} files:\n")
    for d in defs:
        print(f" [{d['type']:8}] {d['file']}:{d['start_line']}-{d['end_line']} {d['name']}")