Spaces:
Running
Running
| """ | |
| Code parser (tree-sitter). | |
| Turns a Python source file into a list of *definitions* -- functions, classes, | |
| and methods -- each with the exact source text and 1-indexed start/end lines. | |
| Those line numbers are what make citations exact and un-hallucinatable later. | |
| It handles the real-world cases that trip up naive parsing: | |
| - methods inside classes (named "ClassName.method", type "method") | |
| - decorators (@app.route etc. -> included in the chunk + lines) | |
| - nested functions (found via full-tree recursion) | |
| - module-level imports (collected into one "module" context chunk) | |
| Run standalone to inspect a repo's definitions: | |
| python -m src.ingestion.parser path/to/repo_dir (or path/to/repo.zip) | |
| """ | |
| import tree_sitter_python as tspython | |
| from tree_sitter import Language, Parser | |
# Grammar and parser are built once at import time and shared by every
# extract_definitions() call in this module.
_PY_LANGUAGE = Language(tspython.language())
_parser = Parser(_PY_LANGUAGE)
# Top-level nodes we fold into a per-file "module" chunk (imports + config).
# Only direct children of the root node are tested against this set.
_MODULE_CONTEXT_TYPES = {
"import_statement",
"import_from_statement",
"expression_statement", # top-level assignments / constants
# NOTE(review): expression_statement presumably also matches other bare
# top-level expressions (e.g. a module docstring) -- confirm against the grammar.
}
def _node_text(src_bytes, node):
    """Return the source text covered by *node*.

    Slices ``src_bytes`` with the node's byte offsets and decodes the slice
    as UTF-8; bytes that cannot be decoded are silently dropped.
    """
    covered = src_bytes[node.start_byte:node.end_byte]
    return str(covered, "utf-8", "ignore")
def _name_of(src_bytes, def_node):
    """Return the identifier of a function/class definition node.

    Reads the node's ``name`` field; if the node has no such field the
    placeholder ``"<anonymous>"`` is returned instead.
    """
    name_node = def_node.child_by_field_name("name")
    if name_node is None:
        return "<anonymous>"
    return _node_text(src_bytes, name_node)
def _unwrap(node):
    """Classify *node* as a definition, returning the nodes to name and to slice.

    Returns ``(def_node, outer_node)`` when *node* is a function/class
    definition or a ``decorated_definition`` wrapping one, otherwise ``None``.

    For a plain definition both elements are *node* itself. For a decorated
    one, ``def_node`` is the inner definition (used for the name) and
    ``outer_node`` is the wrapper, so the decorators are included in the
    chunk's code and line range.
    """
    definition_types = ("function_definition", "class_definition")
    node_type = node.type

    if node_type in definition_types:
        return node, node
    if node_type != "decorated_definition":
        return None

    # First direct child that is the actual def/class; the rest are decorators.
    inner = next(
        (child for child in node.children if child.type in definition_types),
        None,
    )
    return None if inner is None else (inner, node)
def extract_definitions(rel_path, source):
    """Return a list of definition dicts for one file.

    Args:
        rel_path: Path recorded in every chunk's ``"file"`` field (and used as
            the ``"name"`` of the module chunk).
        source: The file's contents as ``str``.

    Returns:
        A list of dicts with the keys ``"file"``, ``"type"``, ``"name"``,
        ``"start_line"``, ``"end_line"`` (both 1-indexed) and ``"code"``.
        Function/class/method chunks come first, in source order; if the file
        has any top-level import or expression statements, one ``"module"``
        chunk collecting them is appended last.

    Naming: a definition found while walking a class body is prefixed with the
    class's qualified name (``Outer.Inner.method``) and functions there get
    type ``"method"``. Definitions nested inside a function are named on their
    own, without the enclosing function or class.
    """
    src_bytes = source.encode("utf-8")
    root = _parser.parse(src_bytes).root_node
    defs = []

    # Explicit stack instead of recursion: the walk visits every node of the
    # tree, and a deeply nested file would otherwise hit RecursionError.
    pending = []

    def push_children(node, parent_class):
        # Reversed so the first child is popped first, which keeps the
        # emitted definitions in source (pre-order) order.
        pending.extend((child, parent_class) for child in reversed(node.children))

    push_children(root, None)
    while pending:
        node, parent_class = pending.pop()
        info = _unwrap(node)
        if info is None:
            # not a definition -> keep descending to find nested defs
            push_children(node, parent_class)
            continue

        def_node, outer = info
        is_class = def_node.type == "class_definition"
        kind = "class" if is_class else ("method" if parent_class else "function")
        name = _name_of(src_bytes, def_node)
        full_name = f"{parent_class}.{name}" if parent_class else name
        defs.append({
            "file": rel_path,
            "type": kind,
            "name": full_name,
            # tree-sitter rows are 0-based; the outer node includes decorators
            "start_line": outer.start_point[0] + 1,
            "end_line": outer.end_point[0] + 1,
            "code": _node_text(src_bytes, outer),
        })
        # Descend into the definition itself (not the decorators). A class
        # passes its qualified name down so nested classes' methods are named
        # consistently with the class chunk; a function resets the parent.
        push_children(def_node, full_name if is_class else None)

    # one "module" chunk per file: top-level imports + constants
    context_nodes = [c for c in root.children if c.type in _MODULE_CONTEXT_TYPES]
    if context_nodes:
        defs.append({
            "file": rel_path,
            "type": "module",
            "name": rel_path,
            # NOTE: the range runs from the first to the last context node, so
            # it can cover definitions that sit between them even though
            # "code" holds only the context statements themselves.
            "start_line": context_nodes[0].start_point[0] + 1,
            "end_line": context_nodes[-1].end_point[0] + 1,
            "code": "\n".join(_node_text(src_bytes, n) for n in context_nodes),
        })
    return defs
def parse_files(files):
    """Parse every scanned file and return all definitions in one flat list.

    Each record in *files* must provide ``"rel_path"`` and ``"source"``; the
    results keep the order of *files*, and within a file the order produced
    by extract_definitions.
    """
    return [
        definition
        for record in files
        for definition in extract_definitions(record["rel_path"], record["source"])
    ]
if __name__ == "__main__":
    # Standalone inspection: print every definition found in a repo dir or zip.
    import sys

    from src.ingestion.scanner import scan_python_files, scan_repo

    if len(sys.argv) < 2:
        print("usage: python -m src.ingestion.parser <repo.zip | repo_dir>")
        sys.exit(1)

    target = sys.argv[1]
    if target.lower().endswith(".zip"):
        # only the first element of scan_repo's result is used as the file list
        files = scan_repo(target)[0]
    else:
        files = scan_python_files(target)

    defs = parse_files(files)
    print(f"Extracted {len(defs)} definitions from {len(files)} files:\n")
    for d in defs:
        print(f" [{d['type']:8}] {d['file']}:{d['start_line']}-{d['end_line']} {d['name']}")