""" codegraph/extractor.py — V2 Multi-language metadata extractor. V1 used Python's ast module → Python-only, returned empty object on SyntaxError. V2 uses tree-sitter → Python + JS + TS + TSX with same API. V2 also returns structured SyntaxError with line + message → agent can fix it. tree-sitter is error-tolerant: returns a partial parse tree even for broken code, so we always get *some* metadata even from syntactically broken submissions. """ import ast as pyast from typing import Dict, Any # ── tree-sitter setup ───────────────────────────────────────────────────────── _PARSERS: Dict[str, Any] = {} def _get_parser(ext: str): """Lazy-load language parser. Falls back to Python if grammar unavailable.""" global _PARSERS if ext in _PARSERS: return _PARSERS[ext] try: from tree_sitter import Language, Parser if ext in (".py",): import tree_sitter_python as tspython lang = Language(tspython.language()) elif ext in (".js", ".ts", ".tsx", ".jsx"): import tree_sitter_javascript as tsjavascript lang = Language(tsjavascript.language()) else: import tree_sitter_python as tspython lang = Language(tspython.language()) parser = Parser(lang) _PARSERS[ext] = parser return parser except Exception: # tree-sitter not installed → signal caller to use ast-only path _PARSERS[ext] = None return None def extract_metadata(code: str, filename: str, step: int) -> Dict[str, Any]: """ Extract structured metadata from agent code. Returns: dict with keys: status, functions, imports, conventions, language, created_at_step On syntax error: status='syntax_error', error, line, col, feedback V2 guarantee: always returns a dict, never raises. """ ext = _get_ext(filename) # ── Python path: try ast for exact SyntaxError info ────────────────────── if ext == ".py": try: pyast.parse(code) except SyntaxError as e: return { "status": "syntax_error", "error": str(e.msg), "line": e.lineno, "col": e.offset, "feedback": f"SyntaxError line {e.lineno}: {e.msg}. Fix before grading.", "functions": [], "imports": [], "conventions": {}, "created_at_step": step, "language": "py", } # ── tree-sitter parse (works even on broken JS/TS) ──────────────────────── parser = _get_parser(ext) functions, imports = [], [] if parser: try: tree = parser.parse(code.encode()) def walk(node): if node.type in ( "function_definition", "function_declaration", "arrow_function", "method_definition", ): name_node = node.child_by_field_name("name") if name_node: functions.append({ "name": name_node.text.decode(), "start_line": node.start_point[0], }) if node.type in ( "import_statement", "import_from_statement", "import_declaration", ): imports.append(node.text.decode()[:120]) for child in node.children: walk(child) walk(tree.root_node) except Exception: pass # Partial results are fine # ── Fallback: pure ast for Python when tree-sitter unavailable ─────────── if not functions and ext == ".py": try: tree = pyast.parse(code) for node in pyast.walk(tree): if isinstance(node, pyast.FunctionDef): functions.append({"name": node.name, "start_line": node.lineno}) if isinstance(node, pyast.Import): imports += [a.name for a in node.names] if isinstance(node, pyast.ImportFrom) and node.module: imports.append(node.module) except Exception: pass conventions = { "uses_try_catch": "try:" in code or "try {" in code, "uses_type_hints": (": " in code and " -> " in code) or ": str" in code or ": int" in code, "no_print_stmts": "print(" not in code, "uses_docstrings": '"""' in code or "'''" in code, "language": ext.lstrip("."), } return { "status": "ok", "functions": functions, "imports": imports, "conventions": conventions, "created_at_step": step, "language": ext.lstrip("."), } def _get_ext(filename: str) -> str: if "." in filename: return "." + filename.rsplit(".", 1)[-1].lower() return ".py"