"""
Code parser (tree-sitter).

Turns a Python source file into a list of *definitions* -- functions, classes,
and methods -- each with the exact source text and 1-indexed start/end lines.
Those line numbers are what make citations exact and un-hallucinatable later.

It handles the real-world cases that trip up naive parsing:
  - methods inside classes   (named "ClassName.method", type "method")
  - nested classes           (members are named "Outer.Inner.method")
  - decorators               (@app.route etc. -> included in the chunk + lines)
  - nested functions         (found via a full-tree traversal)
  - module-level imports     (collected into one "module" context chunk)

Run standalone to inspect a repo's definitions:
    python -m src.ingestion.parser path/to/repo_dir
"""

import tree_sitter_python as tspython
from tree_sitter import Language, Parser

_PY_LANGUAGE = Language(tspython.language())
_parser = Parser(_PY_LANGUAGE)

# Top-level nodes we fold into a per-file "module" chunk (imports + config).
_MODULE_CONTEXT_TYPES = {
    "import_statement",
    "import_from_statement",
    "expression_statement",  # top-level assignments / constants
}


def _node_text(src_bytes, node):
    """Return the source text covered by *node*.

    Slices *src_bytes* (the UTF-8 encoded source) by the node's byte offsets
    and decodes it, silently dropping any bytes that do not decode.
    """
    return src_bytes[node.start_byte:node.end_byte].decode("utf-8", errors="ignore")


def _name_of(src_bytes, def_node):
    """Return the text of *def_node*'s ``name`` field, or "" if it has none."""
    name_node = def_node.child_by_field_name("name")
    return _node_text(src_bytes, name_node) if name_node is not None else ""


def _unwrap(node):
    """
    If node is (or wraps) a function/class definition, return
    (def_node_for_naming, outer_node_for_range); else None.

    decorated_definition wraps the real def with its decorators, so we name
    from the inner def but take the code/line range from the outer node so the
    decorators are part of the chunk.
    """
    if node.type in ("function_definition", "class_definition"):
        return node, node
    if node.type == "decorated_definition":
        for child in node.children:
            if child.type in ("function_definition", "class_definition"):
                return child, node
    return None


def extract_definitions(rel_path, source):
    """Return a list of definition dicts for one file.

    Args:
        rel_path: Path recorded in every chunk's "file" key; it is also used
            as the "name" of the per-file module chunk.
        source: The file's source code as a string.

    Returns:
        A list of dicts with the keys "file", "type", "name", "start_line",
        "end_line" and "code". Functions, methods and classes come first, in
        document order; if the file has any top-level import or expression
        statements, one chunk of type "module" is appended last.
    """
    src_bytes = source.encode("utf-8")
    root = _parser.parse(src_bytes).root_node
    defs = []

    def emit(def_node, outer, parent_class):
        """Record one definition; return the class scope for its members.

        Returns the qualified class name when *def_node* is a class (so its
        members are named consistently with the class chunk itself, including
        for nested classes), and None for functions so that anything nested
        inside a function is reported as a plain function.
        """
        is_class = def_node.type == "class_definition"
        if is_class:
            kind = "class"
        elif parent_class:
            kind = "method"
        else:
            kind = "function"
        name = _name_of(src_bytes, def_node)
        full_name = f"{parent_class}.{name}" if parent_class else name
        defs.append({
            "file": rel_path,
            "type": kind,
            "name": full_name,
            # tree-sitter rows are 0-indexed; chunks use 1-indexed lines.
            "start_line": outer.start_point[0] + 1,
            "end_line": outer.end_point[0] + 1,
            "code": _node_text(src_bytes, outer),
        })
        return full_name if is_class else None

    # Pre-order traversal with an explicit stack instead of recursion, so a
    # deeply nested syntax tree (e.g. a very long chained expression) cannot
    # exceed Python's recursion limit. Children are pushed in reverse so they
    # are popped -- and therefore emitted -- in document order.
    pending = [(child, None) for child in reversed(root.children)]
    while pending:
        node, parent_class = pending.pop()
        info = _unwrap(node)
        if info:
            def_node, outer = info
            # descend into the body: a class's inner funcs become methods
            scope = emit(def_node, outer, parent_class)
            descend_from = def_node
        else:
            # not a definition -> keep descending to find nested defs
            scope = parent_class
            descend_from = node
        pending.extend((child, scope) for child in reversed(descend_from.children))

    # One "module" chunk per file: top-level imports + constants. The filter
    # is by node type only, so any other top-level expression statement is
    # included as well. The line range runs from the first to the last context
    # node and may therefore span definitions that sit between them, while
    # "code" holds only the context nodes' own text.
    context_nodes = [c for c in root.children if c.type in _MODULE_CONTEXT_TYPES]
    if context_nodes:
        defs.append({
            "file": rel_path,
            "type": "module",
            "name": rel_path,
            "start_line": context_nodes[0].start_point[0] + 1,
            "end_line": context_nodes[-1].end_point[0] + 1,
            "code": "\n".join(_node_text(src_bytes, n) for n in context_nodes),
        })
    return defs


def parse_files(files):
    """Run extract_definitions over scanner output (list of file records).

    Each record must provide the keys "rel_path" and "source". The result is
    one flat list with every file's definitions, in input order.
    """
    all_defs = []
    for record in files:
        all_defs.extend(extract_definitions(record["rel_path"], record["source"]))
    return all_defs


def _main(argv):
    """Print every definition found under the path given in ``argv[1]``.

    A target ending in ".zip" (case-insensitive) is read with ``scan_repo``,
    whose first return element is used as the file list; anything else is
    read with ``scan_python_files``. Returns the process exit status.
    """
    from src.ingestion.scanner import scan_python_files, scan_repo

    if len(argv) < 2:
        print("usage: python -m src.ingestion.parser <repo_dir_or_zip>")
        return 1

    target = argv[1]
    if target.lower().endswith(".zip"):
        files = scan_repo(target)[0]
    else:
        files = scan_python_files(target)

    defs = parse_files(files)
    print(f"Extracted {len(defs)} definitions from {len(files)} files:\n")
    for d in defs:
        print(f" [{d['type']:8}] {d['file']}:{d['start_line']}-{d['end_line']} {d['name']}")
    return 0


if __name__ == "__main__":
    import sys

    sys.exit(_main(sys.argv))