import json
import re
from typing import Any

FALLBACK_ACTION = json.dumps({"action": "submit_answer", "answer": "unknown"})

# Single pass used by _preprocess. Complete string literals are matched first
# (and left untouched) so the bare-token rewrites never alter text that lives
# inside a string value such as Python source in a "code" field.
_OUTSIDE_STRING_FIXES = re.compile(
    r'"(?:\\.|[^"\\])*"'          # a complete double-quoted string literal
    r"|\b(?:True|False|None)\b"   # Python literal outside any string
    r"|,\s*(?=[}\]])",            # trailing comma before a closing bracket
    re.DOTALL,
)
_PYTHON_TO_JSON_LITERAL = {"True": "true", "False": "false", "None": "null"}


def _sanitize_string_value(match: re.Match) -> str:
    """
    Receives a regex match of ("key": "value") and cleans only the value part.

    Escapes unescaped newlines, tabs, carriage returns, and inner double quotes.
    Group 1 is the `"key": "` prefix, group 2 the raw value, group 3 the
    closing quote; groups 1 and 3 are returned unchanged.

    NOTE: This is the core trick LangChain uses in _replace_new_line / _custom_parser.
    """
    opening, value, closing = match.group(1, 2, 3)
    # Raw control characters are illegal inside JSON strings; turn each into
    # its two-character escape sequence.
    value = value.replace("\n", "\\n").replace("\r", "\\r").replace("\t", "\\t")
    # Escape any double quote that is not already preceded by a backslash.
    value = re.sub(r'(?<!\\)"', r'\\"', value)
    return opening + value + closing


def _sanitize_all_string_values(text: str) -> str:
    """
    Apply _sanitize_string_value to every JSON string value in the text.

    Uses re.DOTALL so values that span multiple lines are handled correctly.
    The value is matched non-greedily, so it ends at the first double quote
    after the opening one.

    NOTE: Generalised version of LangChain's _custom_parser (which only targeted action_input).
    """
    return re.sub(
        r'("[\w]+"\s*:\s*")(.*?)(")',
        _sanitize_string_value,
        text,
        flags=re.DOTALL,
    )


def _rewrite_bare_token(match: re.Match) -> str:
    """Replacement callback for _OUTSIDE_STRING_FIXES."""
    token = match.group(0)
    if token.startswith('"'):
        # A complete string literal: keep its content exactly as written.
        return token
    # Python literal -> JSON literal; anything else is a trailing comma
    # (plus following whitespace), which is dropped.
    return _PYTHON_TO_JSON_LITERAL.get(token, "")


def _preprocess(text: str) -> str:
    """Fix common LLM response quirks before attempting JSON parsing."""
    # Strip markdown code fences (```json ... ``` or ``` ... ```)
    fenced = re.search(r"```(?:json)?\s*(.*?)```", text, re.DOTALL)
    if fenced:
        text = fenced.group(1).strip()

    # Double curly braces {{"k": "v"}} → {"k": "v"}
    # Only when an opening "{{" is present: ordinary nested JSON legitimately
    # ends in "}}" and must not lose a closing brace.
    if "{{" in text:
        text = text.replace("{{", "{").replace("}}", "}")

    # Python literals → JSON literals and trailing-comma removal, skipping
    # anything inside a complete string literal.
    text = _OUTSIDE_STRING_FIXES.sub(_rewrite_bare_token, text)

    # Outer single-quote wrap '{"k": "v"}' → {"k": "v"}
    if text.startswith("'") and text.endswith("'"):
        text = text[1:-1].replace("\\'", "'")

    return text.strip()


def _extract_json_blob(text: str) -> str:
    """
    Pull out the first {...} or [...] blob from text that has prose around it.

    If no closed blob exists (e.g. the response was cut off), the text from the
    first opening bracket onwards is returned so the partial parser can still
    close it. Text without any opening bracket is returned unchanged.

    Inspired by LangChain's _json_markdown_re fallback in parse_json_markdown.
    """
    match = re.search(r"(\{.*\}|\[.*\])", text, re.DOTALL)
    if match:
        return match.group(1)
    opener = re.search(r"[{\[]", text)
    return text[opener.start():] if opener else text


def _parse_partial_json(s: str) -> Any:
    """
    Parse JSON that may be truncated / missing closing brackets.

    Adapted from LangChain's parse_partial_json (originally from open-interpreter).
    Uses a stack to track open containers and closes them before parsing.

    Raises:
        json.JSONDecodeError: If the text is still not valid JSON after the
            open string and containers have been closed.
    """
    s = s.strip()
    try:
        return json.loads(s)
    except json.JSONDecodeError:
        pass

    pending_closers = []  # closers still owed, innermost last
    in_string = False
    escaped = False  # True when the previous character was an unpaired backslash
    for char in s:
        if in_string:
            if escaped:
                escaped = False
            elif char == "\\":
                escaped = True
            elif char == '"':
                in_string = False
        elif char == '"':
            in_string = True
        elif char == "{":
            pending_closers.append("}")
        elif char == "[":
            pending_closers.append("]")
        elif char in "}]" and pending_closers and pending_closers[-1] == char:
            pending_closers.pop()

    completed = s
    if in_string:
        if escaped:
            # A dangling backslash would escape the quote we are about to add.
            completed = completed[:-1]
        completed += '"'
    completed += "".join(reversed(pending_closers))
    return json.loads(completed)


def _extract_fields_direct(text: str) -> dict:
    """Extract action fields using greedy regex anchored to the last closing quote.

    Handles the case where the model emits unescaped double-quote characters
    inside a "code" or "answer" value (e.g. df["col"]). The non-greedy `(.*?)`
    in _sanitize_all_string_values stops at the *first* inner quote and
    corrupts the output. By using a greedy `(.*)` anchored with a lookahead
    for the last `"}` boundary we capture the full value regardless of inner
    quotes. The captured value is returned as-is (escape sequences are not
    decoded).

    Args:
        text: Pre-processed JSON-like string.

    Returns:
        Dict with 'action' and 'code'/'answer' keys.

    Raises:
        ValueError: If the action field cannot be found or the value cannot
            be extracted for the detected action type.
    """
    action_match = re.search(r'"action"\s*:\s*"(\w+)"', text)
    if not action_match:
        raise ValueError("No 'action' field found")

    action_type = action_match.group(1)
    if action_type == "execute_code":
        m = re.search(r'"code"\s*:\s*"(.*)"(?=\s*})', text, re.DOTALL)
        if m:
            return {"action": "execute_code", "code": m.group(1)}
    elif action_type == "submit_answer":
        m = re.search(r'"answer"\s*:\s*"(.*)"(?=\s*})', text, re.DOTALL)
        if m:
            return {"action": "submit_answer", "answer": m.group(1)}

    raise ValueError(f"Could not extract value for action_type={action_type!r}")


def parse_model_action(response_text: str) -> dict:
    """
    Parse a raw LLM response into an action dict.

    Pipeline (mirrors LangChain's JsonOutputParser internals):
      1. _preprocess – fix markdown fences, double braces, Python literals …
      2. _sanitize_all_string_values – escape unescaped quotes/newlines inside values
      3. _extract_json_blob – strip surrounding prose
      4. _parse_partial_json – close truncated JSON with a stack algorithm

    Each strategy is tried independently so a failure in one doesn't block
    others. A strategy whose result is not a dict (a bare number, string or
    list) is treated as a failure, so the declared return type always holds.
    If every strategy fails, the problem is printed and the decoded
    FALLBACK_ACTION is returned.
    """
    text = response_text.strip()

    strategies = [
        lambda t: _parse_partial_json(t),
        lambda t: _parse_partial_json(_sanitize_all_string_values(_preprocess(t))),
        lambda t: _parse_partial_json(_sanitize_all_string_values(_preprocess(_extract_json_blob(t)))),
        lambda t: _parse_partial_json(_sanitize_all_string_values(_extract_json_blob(_preprocess(t)))),
        lambda t: _parse_partial_json(_sanitize_all_string_values(t)),
        lambda t: _extract_fields_direct(_preprocess(_extract_json_blob(t))),
        lambda t: _extract_fields_direct(_extract_json_blob(t)),
    ]

    for strategy in strategies:
        try:
            result = strategy(text)
        except (json.JSONDecodeError, ValueError):
            continue
        if isinstance(result, dict):
            return result

    print(f"JSON Decoding Error while parsing action in response text: {response_text}")
    return json.loads(FALLBACK_ACTION)