Openenv / codegraph /graph.py
vishaldhakad's picture
intial push
eda351c
Raw
History Blame Contribute Delete
4.58 kB
"""
codegraph/graph.py — CodeGraph V2
The innovation that makes SecureCodeEnv unique.
Structured in-memory database of everything the agent has written this episode.
Persisted in Redis between steps via pickle.
V2 changes:
- tree-sitter replaces ast module → supports Python, JS, TS, TSX
- 60% threshold for style detection (was 50%) → prevents false penalties
- "mixed" state added → no penalty when codebase has no clear dominant style
- compress_graph() added (implemented as CodeGraph.to_slim_dict) → semantic compression for inference context
"""
from dataclasses import dataclass, field
from collections import Counter
from typing import Dict, Any
@dataclass
class CodeGraph:
    """Per-episode record of the files written so far and their derived style.

    ``components`` holds one metadata dict per file (keyed by the value
    ``_file_to_key`` returns for the filename); ``conventions`` is recomputed
    from those dicts on every successful ``update``.
    """

    # Integer seed associated with the episode; not read by any method here.
    episode_seed: int = 0
    # Component key -> metadata dict as passed to ``update`` (plus a "file" entry).
    components: Dict[str, Dict[str, Any]] = field(default_factory=dict)
    # Derived keys: "naming", "error_handling", "uses_type_hints".
    conventions: Dict[str, Any] = field(default_factory=dict)

    def update(self, filename: str, metadata: Dict[str, Any]) -> None:
        """Add or replace a file's metadata in the graph, then re-derive conventions.

        Args:
            filename: Path of the file the metadata describes.
            metadata: Parsed description of the file. A ``"status"`` of
                ``"syntax_error"`` causes the call to be ignored.

        Note:
            The dict is stored as-is (not copied) and gains a ``"file"`` key,
            so the caller's object is modified.
        """
        if metadata.get("status") == "syntax_error":
            return  # Don't pollute the graph with broken code.

        name = _file_to_key(filename)
        metadata["file"] = filename
        self.components[name] = metadata
        self._infer_conventions()

    def _infer_conventions(self) -> None:
        """Derive the dominant codebase style from all stored components.

        Sets three keys on ``self.conventions``:

        * ``"naming"``: the most common naming style among all function names
          when it covers at least 60% of them, ``"mixed"`` when no style
          reaches that share, or ``"unknown"`` when there are no functions.
          The 60% bar exists because a bare majority would wrongly penalise
          mixed codebases.
        * ``"error_handling"``: ``"try_catch"`` when at least half of the
          components report ``uses_try_catch``, otherwise ``"none"``.
        * ``"uses_type_hints"``: ``True`` when at least half of the components
          report ``uses_type_hints``.
        """
        all_fns = [
            f["name"]
            for comp in self.components.values()
            for f in comp.get("functions", [])
        ]
        if all_fns:
            styles = [_naming_style(n) for n in all_fns]
            top, count = Counter(styles).most_common(1)[0]
            self.conventions["naming"] = (
                top if count / len(styles) >= 0.60 else "mixed"
            )
        else:
            self.conventions["naming"] = "unknown"

        # max(..., 1) avoids dividing by zero on an empty graph.
        total = max(len(self.components), 1)

        uses_try = sum(
            1
            for c in self.components.values()
            if c.get("conventions", {}).get("uses_try_catch", False)
        )
        self.conventions["error_handling"] = (
            "try_catch" if uses_try / total >= 0.5 else "none"
        )

        uses_hints = sum(
            1
            for c in self.components.values()
            if c.get("conventions", {}).get("uses_type_hints", False)
        )
        self.conventions["uses_type_hints"] = uses_hints / total >= 0.5

    def to_slim_dict(self, limit: int = 6000) -> str:
        """Serialize a compact JSON summary of the graph for prompt context.

        Each component is reduced to its file, language, up to 20 function
        names, up to 15 top-level import names and its two convention flags.

        When the summary exceeds ``limit`` characters it is shrunk in stages
        so the output remains valid JSON:

        1. drop every component's ``"imports"`` list;
        2. drop whole components, most recently inserted first, recording how
           many were removed under ``"omitted_components"``.

        Only if the document is still too long with no components left (for
        example a very small ``limit``) is the text cut at ``limit``
        characters, in which case it may not parse.

        Args:
            limit: Maximum number of characters in the returned string.

        Returns:
            The JSON text, never longer than ``limit`` characters.
        """
        import json

        slim_components: Dict[str, Dict[str, Any]] = {
            name: {
                "file": comp.get("file", ""),
                "language": comp.get("language", "py"),
                "functions": [f["name"] for f in comp.get("functions", [])][:20],
                "imports": [i.split(".")[0] for i in comp.get("imports", [])][:15],
                "uses_try_catch": comp.get("conventions", {}).get("uses_try_catch", False),
                "uses_type_hints": comp.get("conventions", {}).get("uses_type_hints", False),
            }
            for name, comp in self.components.items()
        }
        slim: Dict[str, Any] = {
            "conventions": self.conventions,
            "components": slim_components,
        }

        result = json.dumps(slim, indent=2)
        if len(result) <= limit:
            return result

        # Stage 1: drop the import lists from every component.
        for entry in slim_components.values():
            entry.pop("imports", None)
        result = json.dumps(slim, indent=2)

        # Stage 2: remove whole components rather than slicing through the
        # JSON text, so whatever is returned can still be parsed.
        omitted = 0
        while len(result) > limit and slim_components:
            slim_components.popitem()  # removes the most recently inserted entry
            omitted += 1
            slim["omitted_components"] = omitted
            result = json.dumps(slim, indent=2)

        # Last resort: honour the length cap even if nothing else can be removed.
        return result[:limit]
# ── helpers ──────────────────────────────────────────────────────────────────
def _file_to_key(filename: str) -> str:
"""Convert 'src/auth/UserAuth.py' β†’ 'UserAuth'"""
base = filename.split("/")[-1]
for ext in (".py", ".js", ".ts", ".tsx", ".jsx"):
base = base.replace(ext, "")
return base
def _naming_style(name: str) -> str:
if "_" in name:
return "snake_case"
if name and name[0].isupper():
return "PascalCase"
if any(c.isupper() for c in name[1:]):
return "camelCase"
return "snake_case" # all-lowercase defaults to snake