Spaces:
Running
Running
echoboi Claude Sonnet 4.6 commited on
Commit Β·
77b9e25
1
Parent(s): 9e4cfa3
Strip docstrings from description length (parsimony scoring)
strip_comments() now does a two-pass strip:
1. AST pass: removes module/class/function docstring nodes
(Expr(Constant(str)) in first-statement position)
2. Tokenize pass: removes # inline comments
Before: agent code with large docstrings was penalised vs clean code
even when the actual algorithm was identical.
After: stripped_code_length(code_with_docs) == stripped_code_length(code_clean)
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
- discovery_env/scoring.py +34 -2
discovery_env/scoring.py
CHANGED
|
@@ -101,10 +101,42 @@ def functional_accuracy(
|
|
| 101 |
|
| 102 |
|
| 103 |
def strip_comments(code: str) -> str:
|
| 104 |
-
"""Strip comments from Python source code.
|
| 105 |
|
| 106 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 107 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 108 |
try:
|
| 109 |
tokens = tokenize.generate_tokens(io.StringIO(code).readline)
|
| 110 |
result = []
|
|
|
|
| 101 |
|
| 102 |
|
| 103 |
def strip_comments(code: str) -> str:
|
| 104 |
+
"""Strip comments AND docstrings from Python source code.
|
| 105 |
|
| 106 |
+
Two-pass approach:
|
| 107 |
+
1. AST pass — removes module/class/function docstrings
|
| 108 |
+
(string-expression nodes in first-statement position).
|
| 109 |
+
These are the triple-quoted blocks that bloat agent code
|
| 110 |
+
without contributing algorithmic complexity.
|
| 111 |
+
2. Tokenize pass — removes remaining # inline comments.
|
| 112 |
+
|
| 113 |
+
Blank lines are also removed. The result is a fair proxy for
|
| 114 |
+
the algorithmic description length used in parsimony scoring.
|
| 115 |
"""
|
| 116 |
+
import ast as _ast
|
| 117 |
+
|
| 118 |
+
# ── Pass 1: remove docstrings via AST ──────────────────────────────────────
|
| 119 |
+
try:
|
| 120 |
+
tree = _ast.parse(code)
|
| 121 |
+
for node in _ast.walk(tree):
|
| 122 |
+
if not isinstance(node, (_ast.FunctionDef, _ast.AsyncFunctionDef,
|
| 123 |
+
_ast.ClassDef, _ast.Module)):
|
| 124 |
+
continue
|
| 125 |
+
if not node.body:
|
| 126 |
+
continue
|
| 127 |
+
first = node.body[0]
|
| 128 |
+
if (isinstance(first, _ast.Expr) and
|
| 129 |
+
isinstance(first.value, _ast.Constant) and
|
| 130 |
+
isinstance(first.value.value, str)):
|
| 131 |
+
node.body.pop(0)
|
| 132 |
+
# A body can't be empty β insert pass if needed
|
| 133 |
+
if not node.body:
|
| 134 |
+
node.body.append(_ast.Pass())
|
| 135 |
+
code = _ast.unparse(tree)
|
| 136 |
+
except Exception:
|
| 137 |
+
pass # malformed code β fall through to tokenize-only
|
| 138 |
+
|
| 139 |
+
# ── Pass 2: remove # comments via tokenize ─────────────────────────────────
|
| 140 |
try:
|
| 141 |
tokens = tokenize.generate_tokens(io.StringIO(code).readline)
|
| 142 |
result = []
|