File size: 4,100 Bytes
6835659
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
import json
import re
from typing import Any


def _strip_code_fences(text: str) -> str:
    """
    Removes markdown code fences like ```json ... ``` or ``` ... ```.
    """
    text = text.strip()
    if text.startswith("```"):
        first_newline = text.find("\n")
        if first_newline != -1:
            text = text[first_newline + 1 :]
        if text.rstrip().endswith("```"):
            text = text.rstrip()[:-3]
    return text.strip()


def _extract_first_json_object(text: str) -> str | None:
    """
    Extracts the first valid JSON object substring using brace counting.
    Works even if additional text exists after JSON.
    """
    start = text.find("{")
    if start == -1:
        return None

    depth = 0
    in_str = False
    escape = False

    for idx in range(start, len(text)):
        ch = text[idx]

        if in_str:
            if escape:
                escape = False
            elif ch == "\\":
                escape = True
            elif ch == '"':
                in_str = False
            continue

        if ch == '"':
            in_str = True
            continue

        if ch == "{":
            depth += 1
        elif ch == "}":
            depth -= 1
            if depth == 0:
                return text[start : idx + 1]

    return text[start:]


def _close_open_braces(text: str) -> str:
    """
    If JSON is truncated, add missing closing braces.
    """
    open_braces = text.count("{")
    close_braces = text.count("}")
    if close_braces < open_braces:
        text = text + ("}" * (open_braces - close_braces))
    return text


def _remove_trailing_commas(text: str) -> str:
    """
    Removes trailing commas before closing ] or }
    """
    return re.sub(r",\s*([}\]])", r"\1", text)


def _truncate_to_last_safe_boundary(text: str) -> str | None:
    """
    Truncates to the last comma outside of strings to drop incomplete tail data.
    Also handles cases where we're in the middle of a field value.
    """
    depth = 0
    in_str = False
    escape = False
    last_cut = None
    last_colon = None

    for idx, ch in enumerate(text):
        if in_str:
            if escape:
                escape = False
            elif ch == "\\":
                escape = True
            elif ch == '"':
                in_str = False
            continue

        if ch == '"':
            in_str = True
            continue

        if ch == "{":
            depth += 1
        elif ch == "}":
            depth -= 1
        elif ch == ":" and depth >= 1:
            last_colon = idx
        elif ch == "," and depth >= 1:
            last_cut = idx

    # If we found a comma, use that
    if last_cut is not None:
        return text[:last_cut]
    
    # If we found a colon but no comma, try truncating after the colon's value
    # This handles cases like "ligh" where we're mid-field
    if last_colon is not None:
        # Find the end of the current line or next quote
        rest = text[last_colon:]
        # Try to find end of current value
        for i, c in enumerate(rest[1:], 1):
            if c in ['\n', ',', '}']:
                return text[:last_colon + i]
    
    return None


def try_repair_json(text: str) -> dict[str, Any] | None:
    """
    Attempts to recover a JSON object from LLM output.

    Steps, stopping at the first one that parses:
    1. Strip markdown code fences and extract the first ``{...}`` span
       using brace counting.
    2. Parse that span as-is, so already-valid JSON is never altered by
       the repair heuristics below.
    3. Append missing closing braces, remove trailing commas, and parse.
    4. Truncate the repaired text to the last safe boundary (dropping an
       incomplete tail), close braces and remove trailing commas again,
       and parse.

    Returns the parsed object, or None when ``text`` is empty, contains no
    "{", or cannot be parsed after every repair attempt.
    """
    if not text:
        return None

    candidate = _extract_first_json_object(_strip_code_fences(text))
    if candidate is None:
        return None

    def _parse_or_none(raw: str) -> dict[str, Any] | None:
        # json.loads signals malformed input with JSONDecodeError (a
        # ValueError) and pathologically deep nesting with RecursionError;
        # both simply mean "this attempt did not work".
        try:
            return json.loads(raw)
        except (ValueError, RecursionError):
            return None

    # Try the untouched candidate first: the repairs are heuristics and
    # must not get a chance to damage input that is already valid.
    parsed = _parse_or_none(candidate)
    if parsed is not None:
        return parsed

    repaired = _remove_trailing_commas(_close_open_braces(candidate))
    parsed = _parse_or_none(repaired)
    if parsed is not None:
        return parsed

    # Last resort: drop whatever follows the last safe boundary and retry.
    truncated = _truncate_to_last_safe_boundary(repaired)
    if not truncated:
        return None

    return _parse_or_none(_remove_trailing_commas(_close_open_braces(truncated)))