Spaces:
Sleeping
Sleeping
| """ | |
| codegraph/extractor.py — V2 Multi-language metadata extractor. | |
| V1 used Python's ast module — Python-only, returned empty object on SyntaxError. | |
| V2 uses tree-sitter — Python + JS + TS + TSX with same API. | |
| V2 also returns structured SyntaxError with line + message — agent can fix it. | |
| tree-sitter is error-tolerant: returns a partial parse tree even for broken code, | |
| so we always get *some* metadata even from syntactically broken submissions. | |
| """ | |
| import ast as pyast | |
| from typing import Dict, Any | |
| # ── tree-sitter setup ───────────────────────────────────────────────────────── | |
| _PARSERS: Dict[str, Any] = {} | |
| def _get_parser(ext: str): | |
| """Lazy-load language parser. Falls back to Python if grammar unavailable.""" | |
| global _PARSERS | |
| if ext in _PARSERS: | |
| return _PARSERS[ext] | |
| try: | |
| from tree_sitter import Language, Parser | |
| if ext in (".py",): | |
| import tree_sitter_python as tspython | |
| lang = Language(tspython.language()) | |
| elif ext in (".js", ".ts", ".tsx", ".jsx"): | |
| import tree_sitter_javascript as tsjavascript | |
| lang = Language(tsjavascript.language()) | |
| else: | |
| import tree_sitter_python as tspython | |
| lang = Language(tspython.language()) | |
| parser = Parser(lang) | |
| _PARSERS[ext] = parser | |
| return parser | |
| except Exception: | |
| # tree-sitter not installed β signal caller to use ast-only path | |
| _PARSERS[ext] = None | |
| return None | |
# tree-sitter node types that are treated as function definitions / imports.
_TS_FUNCTION_NODES = (
    "function_definition", "function_declaration",
    "arrow_function", "method_definition",
)
_TS_IMPORT_NODES = (
    "import_statement", "import_from_statement",
    "import_declaration",
)


def _collect_tree_sitter(root, functions, imports):
    """Append functions/imports found under tree-sitter node *root*, in document order."""
    # Explicit stack instead of recursion so deeply nested code cannot hit
    # the interpreter's recursion limit.
    stack = [root]
    while stack:
        node = stack.pop()
        if node.type in _TS_FUNCTION_NODES:
            name_node = node.child_by_field_name("name")
            if name_node is None and node.type == "arrow_function":
                # Arrow functions carry no name of their own; for
                # `const f = () => ...` the name lives on the declarator.
                parent = node.parent
                if parent is not None and parent.type == "variable_declarator":
                    candidate = parent.child_by_field_name("name")
                    if candidate is not None and candidate.type == "identifier":
                        name_node = candidate
            if name_node is not None:
                functions.append({
                    "name": name_node.text.decode(),
                    "start_line": node.start_point[0],
                })
        if node.type in _TS_IMPORT_NODES:
            imports.append(node.text.decode()[:120])
        # Reversed so that popping yields children left-to-right (pre-order).
        stack.extend(reversed(node.children))


def _collect_python_ast(code, functions, imports):
    """Append functions/imports found by the ``ast`` module, in the tree-sitter output format."""
    stack = [pyast.parse(code)]
    while stack:
        node = stack.pop()
        if isinstance(node, (pyast.FunctionDef, pyast.AsyncFunctionDef)):
            # ast line numbers are 1-based; the tree-sitter path reports
            # 0-based rows, so convert to keep both paths consistent.
            functions.append({"name": node.name, "start_line": node.lineno - 1})
        elif isinstance(node, (pyast.Import, pyast.ImportFrom)):
            segment = pyast.get_source_segment(code, node)
            if segment:
                imports.append(segment[:120])
        stack.extend(reversed(list(pyast.iter_child_nodes(node))))


def extract_metadata(code: str, filename: str, step: int) -> Dict[str, Any]:
    """
    Extract structured metadata from agent code.

    Python sources are first checked with ``ast.parse``; a syntax error
    short-circuits with a structured error result. Otherwise functions and
    imports are collected with tree-sitter when a parser is available, and
    with the ``ast`` module for Python when it is not (or when the
    tree-sitter walk failed).

    Args:
        code: Source text to analyse.
        filename: Name used only to determine the language via its extension.
        step: Value copied verbatim into ``created_at_step``.

    Returns:
        dict with keys: status, functions, imports, conventions, language,
        created_at_step.
        ``functions`` is a list of ``{"name", "start_line"}`` with a 0-based
        ``start_line``; ``imports`` is a list of import statement texts
        truncated to 120 characters.
        On syntax error: status='syntax_error', error, line, col, feedback
        (``line`` is the 1-based line reported by the Python parser), with
        empty functions/imports/conventions.

    Parsing failures are reported in the result or degrade to partial/empty
    metadata rather than being raised.
    """
    ext = _get_ext(filename)

    # -- Python path: use ast for exact SyntaxError info ----------------------
    if ext == ".py":
        try:
            pyast.parse(code)
        except (SyntaxError, ValueError) as e:
            # ValueError: older CPython versions reject NUL bytes this way
            # instead of raising SyntaxError.
            message = getattr(e, "msg", e)
            lineno = getattr(e, "lineno", None)
            return {
                "status": "syntax_error",
                "error": str(message),
                "line": lineno,
                "col": getattr(e, "offset", None),
                "feedback": f"SyntaxError line {lineno}: {message}. Fix before grading.",
                "functions": [],
                "imports": [],
                "conventions": {},
                "created_at_step": step,
                "language": "py",
            }
        except (RecursionError, MemoryError):
            # ast could not process the input; continue with best-effort
            # extraction below instead of propagating.
            pass

    # -- tree-sitter parse (works even on broken JS/TS) ------------------------
    functions, imports = [], []
    parser = _get_parser(ext)
    tree_sitter_done = False
    if parser is not None:
        try:
            tree = parser.parse(code.encode())
            _collect_tree_sitter(tree.root_node, functions, imports)
            tree_sitter_done = True
        except Exception:
            pass  # Partial results are fine

    # -- Fallback: pure ast for Python when tree-sitter gave no full result ---
    # Gated on whether tree-sitter completed (not on `functions` being empty)
    # so a function-less module does not get its imports recorded twice.
    if ext == ".py" and not tree_sitter_done:
        ast_functions, ast_imports = [], []
        try:
            _collect_python_ast(code, ast_functions, ast_imports)
        except Exception:
            pass  # keep whatever tree-sitter managed to collect
        else:
            functions, imports = ast_functions, ast_imports

    language = ext.lstrip(".")
    # Cheap substring heuristics; they do not inspect the parse tree.
    conventions = {
        "uses_try_catch": "try:" in code or "try {" in code,
        "uses_type_hints": (": " in code and " -> " in code) or ": str" in code or ": int" in code,
        "no_print_stmts": "print(" not in code,
        "uses_docstrings": '"""' in code or "'''" in code,
        "language": language,
    }
    return {
        "status": "ok",
        "functions": functions,
        "imports": imports,
        "conventions": conventions,
        "created_at_step": step,
        "language": language,
    }
| def _get_ext(filename: str) -> str: | |
| if "." in filename: | |
| return "." + filename.rsplit(".", 1)[-1].lower() | |
| return ".py" | |