import json import re from typing import Any def _strip_code_fences(text: str) -> str: """ Removes markdown code fences like ```json ... ``` or ``` ... ```. """ text = text.strip() if text.startswith("```"): first_newline = text.find("\n") if first_newline != -1: text = text[first_newline + 1 :] if text.rstrip().endswith("```"): text = text.rstrip()[:-3] return text.strip() def _extract_first_json_object(text: str) -> str | None: """ Extracts the first valid JSON object substring using brace counting. Works even if additional text exists after JSON. """ start = text.find("{") if start == -1: return None depth = 0 in_str = False escape = False for idx in range(start, len(text)): ch = text[idx] if in_str: if escape: escape = False elif ch == "\\": escape = True elif ch == '"': in_str = False continue if ch == '"': in_str = True continue if ch == "{": depth += 1 elif ch == "}": depth -= 1 if depth == 0: return text[start : idx + 1] return text[start:] def _close_open_braces(text: str) -> str: """ If JSON is truncated, add missing closing braces. """ open_braces = text.count("{") close_braces = text.count("}") if close_braces < open_braces: text = text + ("}" * (open_braces - close_braces)) return text def _remove_trailing_commas(text: str) -> str: """ Removes trailing commas before closing ] or } """ return re.sub(r",\s*([}\]])", r"\1", text) def _truncate_to_last_safe_boundary(text: str) -> str | None: """ Truncates to the last comma outside of strings to drop incomplete tail data. Also handles cases where we're in the middle of a field value. """ depth = 0 in_str = False escape = False last_cut = None last_colon = None for idx, ch in enumerate(text): if in_str: if escape: escape = False elif ch == "\\": escape = True elif ch == '"': in_str = False continue if ch == '"': in_str = True continue if ch == "{": depth += 1 elif ch == "}": depth -= 1 elif ch == ":" and depth >= 1: last_colon = idx elif ch == "," and depth >= 1: last_cut = idx # If we found a comma, use that if last_cut is not None: return text[:last_cut] # If we found a colon but no comma, try truncating after the colon's value # This handles cases like "ligh" where we're mid-field if last_colon is not None: # Find the end of the current line or next quote rest = text[last_colon:] # Try to find end of current value for i, c in enumerate(rest[1:], 1): if c in ['\n', ',', '}']: return text[:last_colon + i] return None def try_repair_json(text: str) -> dict[str, Any] | None: """ Attempts to recover JSON from LLM output: - Strips code fences - Extracts first JSON object using brace counting - Repairs missing closing braces - Tries json.loads() """ if not text: return None text = _strip_code_fences(text) candidate = _extract_first_json_object(text) if candidate is None: return None candidate = _close_open_braces(candidate) candidate = _remove_trailing_commas(candidate) try: return json.loads(candidate) except Exception: pass truncated = _truncate_to_last_safe_boundary(candidate) if truncated: truncated = _close_open_braces(truncated) truncated = _remove_trailing_commas(truncated) try: return json.loads(truncated) except Exception: return None return None