openenv-data-clean / server /action_parser.py
Tarkeshwar
Restructure repo to match OpenEnv standard layout
7c6fd7d
Raw
History Blame Contribute Delete
3.09 kB
"""Robust parser for data cleaning commands."""
import re
from dataclasses import dataclass
from typing import Optional
@dataclass
class ParsedAction:
    """Structured result of parsing one raw command string.

    Attributes:
        command_type: One of "inspect", "fix", "delete", "submit" or "error".
        args: Arguments extracted from the command; empty when there are none.
        error_message: Human-readable explanation of a failure, otherwise None.
    """

    command_type: str
    args: dict
    error_message: Optional[str] = None
# Strip markdown code fences and leading "action:" prefixes
_PREFIX_RE = re.compile(
r"^(?:```\w*\s*\n?|action\s*[:\-]\s*|next\s*action\s*[:\-]\s*)",
re.IGNORECASE,
)
_SUFFIX_RE = re.compile(r"\s*```\s*$")
def _strip_quotes(s: str) -> str:
s = s.strip()
if len(s) >= 2 and s[0] == s[-1] and s[0] in ("'", '"'):
return s[1:-1]
return s
def parse_action(raw: str) -> ParsedAction:
    """Turn a raw command string into a ParsedAction.

    The input is trimmed, the leading prefix and trailing code fence are
    removed via the module-level patterns, and the remainder is offered to
    each command parser in turn (submit, inspect, delete, fix). The first
    parser that recognises the text wins. Empty or unrecognised input
    yields a ParsedAction with command_type "error" and an explanatory
    message.
    """
    stripped = raw.strip() if raw else ""
    if not stripped:
        return ParsedAction("error", {}, "Empty command. Use inspect/fix/delete/submit.")

    body = _SUFFIX_RE.sub("", _PREFIX_RE.sub("", stripped)).strip()

    # Lazy generator: parsers after the first successful one are never run.
    attempts = (
        parse(body)
        for parse in (_parse_submit, _parse_inspect, _parse_delete, _parse_fix)
    )
    parsed = next((hit for hit in attempts if hit is not None), None)
    if parsed is not None:
        return parsed

    # The message echoes at most the first 80 characters of the trimmed input.
    return ParsedAction(
        "error",
        {},
        f"Could not parse: '{stripped[:80]}'. "
        "Expected: inspect(\"col\"), fix(row, \"col\", \"val\"), delete(row), or submit()",
    )
def _parse_submit(text: str) -> Optional[ParsedAction]:
    """Recognise ``submit`` or ``submit()`` (case-insensitive).

    Returns a "submit" ParsedAction with no arguments, or None when the
    text is not a submit command.
    """
    is_submit = re.match(r"^submit\s*(\(\s*\))?\s*$", text, re.IGNORECASE)
    return ParsedAction("submit", {}) if is_submit else None
def _parse_inspect(text: str) -> Optional[ParsedAction]:
    """Recognise ``inspect(<column>)`` (case-insensitive).

    The column is a run of word characters, optionally wrapped in matching
    single or double quotes. A keyword form, ``inspect(column="age")`` or
    ``inspect(col="age")``, is accepted as well, mirroring the keyword
    fallback that the fix command supports.

    Returns an "inspect" ParsedAction with args ``{"column": <name>}``, or
    None when the text is not an inspect command.
    """
    m = re.match(
        # The optional "column=" / "col=" prefix needs an "=" to match, so a
        # bare column that happens to be named "col" or "column" still parses
        # positionally.
        r'^inspect\s*\(\s*(?:(?:column|col)\s*=\s*)?(["\']?)(\w+)\1\s*\)$',
        text,
        re.IGNORECASE,
    )
    if m:
        return ParsedAction("inspect", {"column": m.group(2)})
    return None
def _parse_delete(text: str) -> Optional[ParsedAction]:
    """Recognise ``delete(<row>)`` (case-insensitive).

    The row is a non-negative decimal integer. A keyword form,
    ``delete(row=3)``, is accepted as well, mirroring the keyword fallback
    that the fix command supports.

    Returns a "delete" ParsedAction with args ``{"row": <int>}``, or None
    when the text is not a delete command.
    """
    m = re.match(
        r"^delete\s*\(\s*(?:row\s*=\s*)?(\d+)\s*\)$", text, re.IGNORECASE
    )
    if m:
        return ParsedAction("delete", {"row": int(m.group(1))})
    return None
def _parse_fix(text: str) -> Optional[ParsedAction]:
    """Recognise a fix command in positional or keyword form.

    Accepted shapes (case-insensitive):
        fix(row, "column", "value")
        fix(row=N, column="name", value="...")   # also col= / val=

    The row and column are matched first; everything up to the final
    closing parenthesis is taken as the value, so the value may itself
    contain commas, quotes, or parentheses. One pair of matching outer
    quotes is stripped from the value.

    Returns a "fix" ParsedAction with args ``row`` (int), ``column`` and
    ``value`` (str), or None when neither form matches.
    """
    # Tried in order: strict positional form first, then the more
    # permissive keyword form for LLMs that format differently.
    patterns = (
        r'^fix\s*\(\s*(\d+)\s*,\s*(["\']?)(\w+)\2\s*,\s*(.+)\)$',
        r'^fix\s*\(\s*row\s*=\s*(\d+)\s*,\s*(?:column|col)\s*=\s*(["\']?)(\w+)\2\s*,\s*(?:value|val)\s*=\s*(.+)\)$',
    )
    flags = re.IGNORECASE | re.DOTALL

    for pattern in patterns:
        found = re.match(pattern, text, flags)
        if not found:
            continue
        # Group 2 is the optional quote around the column, so it is skipped.
        row_text, _, column_name, raw_value = found.groups()
        return ParsedAction(
            "fix",
            {
                "row": int(row_text),
                "column": column_name,
                "value": _strip_quotes(raw_value.strip()),
            },
        )
    return None