Openenv / codegraph /extractor.py
vishaldhakad's picture
intial push
eda351c
Raw
History Blame Contribute Delete
5.2 kB
"""
codegraph/extractor.py — V2 Multi-language metadata extractor.
V1 used Python's ast module → Python-only, returned empty object on SyntaxError.
V2 uses tree-sitter → Python + JS + TS + TSX with same API.
V2 also returns structured SyntaxError with line + message → agent can fix it.
tree-sitter is error-tolerant: returns a partial parse tree even for broken code,
so we always get *some* metadata even from syntactically broken submissions.
"""
import ast as pyast
from typing import Dict, Any
# ── tree-sitter setup ─────────────────────────────────────────────────────────
# Per-extension parser cache. A stored None means "building a parser for this
# extension already failed", so the import is not retried on every call.
_PARSERS: Dict[str, Any] = {}


def _get_parser(ext: str):
    """Return the cached tree-sitter parser for *ext*, building it on first use.

    JavaScript-family extensions (.js, .jsx, .ts, .tsx) are given the
    JavaScript grammar; every other extension, including .py, is given the
    Python grammar. The outcome is memoised per extension in ``_PARSERS``.

    Returns:
        A tree-sitter ``Parser``, or None when tree-sitter or the grammar
        package could not be imported or initialised. The None is cached too,
        so the caller can switch to its ast-only path.
    """
    try:
        return _PARSERS[ext]
    except KeyError:
        pass  # First request for this extension: build below.

    try:
        from tree_sitter import Language, Parser

        # NOTE(review): .ts/.tsx are parsed with the JavaScript grammar, not a
        # TypeScript one; type annotations presumably surface as error nodes.
        if ext in (".js", ".ts", ".tsx", ".jsx"):
            import tree_sitter_javascript as grammar
        else:
            import tree_sitter_python as grammar
        built = Parser(Language(grammar.language()))
    except Exception:
        # tree-sitter (or the grammar) is not usable -> signal the caller to
        # use the ast-only path.
        built = None

    _PARSERS[ext] = built
    return built
# tree-sitter node types (Python and JavaScript grammars) that define a callable.
_FUNCTION_NODE_TYPES = frozenset({
    "function_definition", "function_declaration",
    "arrow_function", "method_definition",
})

# tree-sitter node types that represent an import statement.
_IMPORT_NODE_TYPES = frozenset({
    "import_statement", "import_from_statement",
    "import_declaration",
})


def _function_name(node):
    """Return the name of a function-like tree-sitter node, or None if anonymous."""
    name_node = node.child_by_field_name("name")
    if name_node is None and node.type == "arrow_function":
        # Arrow functions carry no name of their own; for
        # `const f = () => ...` the name lives on the enclosing declarator.
        parent = node.parent
        if parent is not None and parent.type == "variable_declarator":
            candidate = parent.child_by_field_name("name")
            if candidate is not None and candidate.type == "identifier":
                name_node = candidate
    if name_node is None:
        return None
    return name_node.text.decode("utf-8", "replace")


def _collect_with_tree_sitter(parser, code, functions, imports) -> bool:
    """Append functions/imports found by tree-sitter; return True if the walk completed."""
    try:
        tree = parser.parse(code.encode())
        # Explicit stack instead of recursion so deeply nested code cannot hit
        # the interpreter's recursion limit. Children are pushed reversed to
        # keep source (pre-order) ordering.
        pending = [tree.root_node]
        while pending:
            node = pending.pop()
            if node.type in _FUNCTION_NODE_TYPES:
                name = _function_name(node)
                if name is not None:
                    functions.append({
                        "name": name,
                        # start_point rows are 0-based; report 1-based lines so
                        # they agree with ast's lineno and SyntaxError.lineno.
                        "start_line": node.start_point[0] + 1,
                    })
            if node.type in _IMPORT_NODE_TYPES:
                imports.append(node.text.decode("utf-8", "replace")[:120])
            pending.extend(reversed(node.children))
    except Exception:
        # Deliberate best effort: extraction must never raise. Whatever was
        # collected so far stays in the lists; the caller decides what to do.
        return False
    return True


def _collect_with_ast(tree):
    """Return (functions, imports) gathered from an already-parsed Python ast tree."""
    functions, imports = [], []
    for node in pyast.walk(tree):
        if isinstance(node, (pyast.FunctionDef, pyast.AsyncFunctionDef)):
            functions.append({"name": node.name, "start_line": node.lineno})
        elif isinstance(node, pyast.Import):
            imports.extend(alias.name for alias in node.names)
        elif isinstance(node, pyast.ImportFrom) and node.module:
            imports.append(node.module)
    return functions, imports


def extract_metadata(code: str, filename: str, step: int) -> Dict[str, Any]:
    """
    Extract structured metadata from agent code.

    Args:
        code: Source text to analyse.
        filename: Name used only to pick the language (via its extension).
        step: Copied verbatim into the result as ``created_at_step``.

    Returns:
        dict with keys: status, functions, imports, conventions, language,
        created_at_step. Each function entry is
        ``{"name": str, "start_line": int}`` with a 1-based line number.
        With tree-sitter, imports are the statement text (max 120 chars);
        in the ast-only fallback they are module names.
        When Python code cannot be parsed: status='syntax_error' plus error,
        line, col, feedback (line/col are None if the parser gave no position).

    V2 guarantee: always returns a dict, never raises.
    """
    ext = _get_ext(filename)
    language = ext.lstrip(".")

    # ── Python path: try ast for exact SyntaxError info ──────────────────────
    py_tree = None
    if ext == ".py":
        try:
            py_tree = pyast.parse(code)
        except SyntaxError as e:
            return {
                "status": "syntax_error",
                "error": str(e.msg),
                "line": e.lineno,
                "col": e.offset,
                "feedback": f"SyntaxError line {e.lineno}: {e.msg}. Fix before grading.",
                "functions": [],
                "imports": [],
                "conventions": {},
                "created_at_step": step,
                "language": language,
            }
        except (ValueError, RecursionError) as e:
            # ast.parse can also reject source with a non-SyntaxError (e.g.
            # ValueError for null bytes); report it the same way so the
            # never-raises guarantee holds.
            return {
                "status": "syntax_error",
                "error": str(e),
                "line": None,
                "col": None,
                "feedback": f"Python parser rejected the code: {e}. Fix before grading.",
                "functions": [],
                "imports": [],
                "conventions": {},
                "created_at_step": step,
                "language": language,
            }

    # ── tree-sitter parse (works even on broken JS/TS) ────────────────────────
    functions, imports = [], []
    walked = False
    parser = _get_parser(ext)
    if parser is not None:
        walked = _collect_with_tree_sitter(parser, code, functions, imports)

    # ── Fallback: pure ast for Python when tree-sitter is unavailable/failed ──
    # Only taken when tree-sitter did not finish, so imports are never
    # collected twice. Reuses the tree parsed above instead of re-parsing.
    if py_tree is not None and not walked:
        functions, imports = _collect_with_ast(py_tree)

    # Cheap substring heuristics; they do not inspect the parse tree.
    conventions = {
        "uses_try_catch": "try:" in code or "try {" in code,
        "uses_type_hints": (": " in code and " -> " in code) or ": str" in code or ": int" in code,
        "no_print_stmts": "print(" not in code,
        "uses_docstrings": '"""' in code or "'''" in code,
        "language": language,
    }
    return {
        "status": "ok",
        "functions": functions,
        "imports": imports,
        "conventions": conventions,
        "created_at_step": step,
        "language": language,
    }
def _get_ext(filename: str) -> str:
    """Return the lower-cased extension of *filename*, including the leading dot.

    Only the final path component is inspected (both "/" and "\\" count as
    separators), so a dot inside a directory name is never mistaken for an
    extension. Names without an extension, or ending in a bare ".", default
    to ".py".
    """
    basename = filename.replace("\\", "/").rsplit("/", 1)[-1]
    _, dot, suffix = basename.rpartition(".")
    if not dot or not suffix:
        return ".py"
    return "." + suffix.lower()