# File size: 4,100 Bytes
import json
import re
from typing import Any
def _strip_code_fences(text: str) -> str:
"""
Removes markdown code fences like ```json ... ``` or ``` ... ```.
"""
text = text.strip()
if text.startswith("```"):
first_newline = text.find("\n")
if first_newline != -1:
text = text[first_newline + 1 :]
if text.rstrip().endswith("```"):
text = text.rstrip()[:-3]
return text.strip()
def _extract_first_json_object(text: str) -> str | None:
"""
Extracts the first valid JSON object substring using brace counting.
Works even if additional text exists after JSON.
"""
start = text.find("{")
if start == -1:
return None
depth = 0
in_str = False
escape = False
for idx in range(start, len(text)):
ch = text[idx]
if in_str:
if escape:
escape = False
elif ch == "\\":
escape = True
elif ch == '"':
in_str = False
continue
if ch == '"':
in_str = True
continue
if ch == "{":
depth += 1
elif ch == "}":
depth -= 1
if depth == 0:
return text[start : idx + 1]
return text[start:]
def _close_open_braces(text: str) -> str:
"""
If JSON is truncated, add missing closing braces.
"""
open_braces = text.count("{")
close_braces = text.count("}")
if close_braces < open_braces:
text = text + ("}" * (open_braces - close_braces))
return text
def _remove_trailing_commas(text: str) -> str:
"""
Removes trailing commas before closing ] or }
"""
return re.sub(r",\s*([}\]])", r"\1", text)
def _truncate_to_last_safe_boundary(text: str) -> str | None:
"""
Truncates to the last comma outside of strings to drop incomplete tail data.
Also handles cases where we're in the middle of a field value.
"""
depth = 0
in_str = False
escape = False
last_cut = None
last_colon = None
for idx, ch in enumerate(text):
if in_str:
if escape:
escape = False
elif ch == "\\":
escape = True
elif ch == '"':
in_str = False
continue
if ch == '"':
in_str = True
continue
if ch == "{":
depth += 1
elif ch == "}":
depth -= 1
elif ch == ":" and depth >= 1:
last_colon = idx
elif ch == "," and depth >= 1:
last_cut = idx
# If we found a comma, use that
if last_cut is not None:
return text[:last_cut]
# If we found a colon but no comma, try truncating after the colon's value
# This handles cases like "ligh" where we're mid-field
if last_colon is not None:
# Find the end of the current line or next quote
rest = text[last_colon:]
# Try to find end of current value
for i, c in enumerate(rest[1:], 1):
if c in ['\n', ',', '}']:
return text[:last_colon + i]
return None
def _loads_or_none(fragment: str) -> dict[str, Any] | None:
    """Parse ``fragment`` with ``json.loads``; return ``None`` if it fails."""
    try:
        return json.loads(fragment)
    except (ValueError, RecursionError):
        # ValueError covers json.JSONDecodeError; RecursionError covers
        # pathologically deep nesting. Anything else is a real bug and
        # should surface instead of being swallowed.
        return None


def try_repair_json(text: str) -> dict[str, Any] | None:
    """Attempt to recover a JSON object from LLM output.

    Steps, in order:
      1. Strip markdown code fences.
      2. Extract the first ``{...}`` object using brace counting.
      3. Close missing braces and remove trailing commas, then parse.
      4. If that fails, truncate to the last safe boundary to drop an
         incomplete tail, repair again and parse once more.

    Args:
        text: Raw model output that is expected to contain a JSON object.

    Returns:
        The parsed object, or ``None`` when ``text`` is empty, contains no
        ``{``, or cannot be repaired into parseable JSON.
    """
    if not text:
        return None

    candidate = _extract_first_json_object(_strip_code_fences(text))
    if candidate is None:
        return None

    candidate = _remove_trailing_commas(_close_open_braces(candidate))
    parsed = _loads_or_none(candidate)
    if parsed is not None:
        return parsed

    # Second chance: drop the incomplete tail and repair what is left.
    truncated = _truncate_to_last_safe_boundary(candidate)
    if not truncated:
        return None
    truncated = _remove_trailing_commas(_close_open_braces(truncated))
    return _loads_or_none(truncated)
|