File size: 11,412 Bytes
66227af
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
import json
import logging
import re
from typing import Any

from json_repair import repair_json

logger = logging.getLogger(__name__)
# logging.getLogger("sqlalchemy.engine.Engine").disabled = True


def comprehensive_json_repair(json_str: str) -> str:
    """Run the repair strategies in order and return the first usable result.

    The strategies are tried from most to least structure-aware:
    partial-parse closure, contextual closure, line-by-line reconstruction
    and regex pattern fixes. A strategy "wins" as soon as it returns a
    truthy string.

    Args:
        json_str: The malformed JSON text.

    Returns:
        The first truthy strategy result, or the output of
        ``simple_bracket_repair`` when every strategy comes back empty.
        The fallback result is not validated here.
    """
    strategies = (
        try_partial_parse_repair,  # parse the longest closable prefix
        try_contextual_closure_repair,  # close based on the last token
        try_line_reconstruction_repair,  # drop trailing lines, then close
        try_regex_pattern_repair,  # regex fixes for common mistakes
    )

    for strategy in strategies:
        if outcome := strategy(json_str):
            return outcome

    # Nothing structured worked: fall back to plain bracket balancing.
    return simple_bracket_repair(json_str)


def try_partial_parse_repair(json_str: str) -> str | None:
    """Find the longest line-prefix of the input that can be closed into JSON.

    Starting with the full text and dropping one trailing line at a time,
    every candidate produced by ``generate_closure_attempts`` is parsed with
    ``json.loads``; the first candidate that parses is returned.

    Args:
        json_str: The malformed JSON text.

    Returns:
        A parseable JSON string, or ``None`` when no prefix can be closed or
        any unexpected exception occurs (this strategy is best-effort).
    """
    try:
        rows = json_str.split("\n")
        keep = len(rows)

        while keep > 0:
            prefix = "\n".join(rows[:keep])

            for candidate in generate_closure_attempts(prefix):
                try:
                    json.loads(candidate)
                except json.JSONDecodeError:
                    # This way of closing the prefix is not valid; try the next.
                    continue
                return candidate

            keep -= 1

        return None
    except Exception:
        return None


def generate_closure_attempts(partial_json: str) -> list[str]:
    """Build candidate completions for a truncated JSON fragment.

    The fragment is scanned once to find which ``(``, ``{`` and ``[`` are
    still open and whether the text ends inside a string literal. Candidates
    are then produced by appending the matching closers.

    Args:
        partial_json: A prefix of a JSON document.

    Returns:
        Candidate strings in the order they should be tried, without
        duplicates. The candidates are not validated here.
    """
    closer_for = {"(": ")", "{": "}", "[": "]"}

    open_stack: list[str] = []
    in_string = False
    escape_next = False

    for char in partial_json:
        if escape_next:
            # The character after a backslash never has structural meaning.
            escape_next = False
            continue

        if char == "\\":
            escape_next = True
            continue

        if char == '"':
            in_string = not in_string
            continue

        if in_string:
            continue

        if char in closer_for:
            open_stack.append(char)
        elif char in ")}]" and open_stack:
            if closer_for[open_stack.pop()] != char:
                # Mismatched pair - this is likely where corruption started.
                break

    base = partial_json.rstrip()
    attempts: list[str] = []

    # A trailing comma is only a dangling separator when we are outside a
    # string; inside an unterminated string it is part of the text and must
    # be preserved.
    if not in_string and base.endswith(","):
        base = base[:-1]
        attempts.append(base)

    # Innermost opener is closed first.
    closures = [closer_for[opener] for opener in reversed(open_stack)]

    # Close everything that is still open.
    attempts.append(base + "".join(closures))

    # Close only objects/arrays (parentheses are not JSON).
    attempts.append(base + "".join(c for c in closures if c in "]}"))

    # Terminate the dangling string before closing the containers.
    if in_string:
        attempts.append(base + '"' + "".join(closures))

    # Drop duplicates (keeping order) so callers do not re-parse the same text.
    return list(dict.fromkeys(attempts))


def try_contextual_closure_repair(json_str: str) -> str | None:
    """Repair truncated JSON by looking at the kind of its last token.

    The text is tokenized and the final token decides the approach: after a
    value (string, number, boolean, null) the open containers are closed;
    after a comma or colon the dangling structure is completed first.

    Args:
        json_str: The malformed JSON text.

    Returns:
        The repaired JSON string, or ``None`` when there are no tokens, the
        last token is of any other kind, the chosen handler fails, or any
        unexpected exception occurs (this strategy is best-effort).
    """
    try:
        handlers = {
            # Ends on a value: only the enclosing containers are missing.
            "string": try_close_after_value,
            "number": try_close_after_value,
            "boolean": try_close_after_value,
            "null": try_close_after_value,
            # Ends on a separator: something has to be removed or added.
            "comma": try_complete_structure,
            "colon": try_complete_structure,
        }

        tokens: list[dict[str, Any]] = tokenize_json(json_str)
        if not tokens:
            return None

        handler = handlers.get(tokens[-1]["type"])
        if handler is None:
            return None
        return handler(json_str, tokens)
    except Exception:
        return None


def tokenize_json(json_str: str) -> list[dict[str, Any]]:
    """Split JSON-like text into a flat list of tokens.

    The tokenizer is deliberately lenient: it never raises on malformed
    input, and characters that do not start a recognised token are skipped.

    Each token is a dict with the keys:
        ``type``: one of ``string``, ``number``, ``boolean``, ``null``,
            ``object_start``, ``object_end``, ``array_start``,
            ``array_end``, ``comma``, ``colon``.
        ``value``: the source text of the token (strings keep their quotes).
        ``start`` / ``end``: inclusive indexes of the token in ``json_str``.

    Args:
        json_str: The (possibly malformed) JSON text.

    Returns:
        The tokens in source order. An unterminated string yields a
        ``string`` token that runs to the end of the input.
    """
    structural_types = {
        "{": "object_start",
        "}": "object_end",
        "[": "array_start",
        "]": "array_end",
        ",": "comma",
        ":": "colon",
    }
    literals = (("true", "boolean"), ("false", "boolean"), ("null", "null"))

    tokens: list[dict[str, Any]] = []
    length = len(json_str)
    i = 0

    while i < length:
        char = json_str[i]

        # Skip whitespace
        if char.isspace():
            i += 1
            continue

        # String literals
        if char == '"':
            start = i
            i += 1
            while i < length and json_str[i] != '"':
                # Skip a backslash together with the character it escapes so
                # that `\\"` (escaped backslash, then the real closing quote)
                # is not mistaken for an escaped quote.
                i += 2 if json_str[i] == "\\" else 1
            # Clamp for unterminated strings so `end` stays a valid index.
            end = min(i, length - 1)
            tokens.append(
                {
                    "type": "string",
                    "value": json_str[start : end + 1],
                    "start": start,
                    "end": end,
                }
            )
            i = end + 1
            continue

        # Numbers (lenient: any run of digits and `.`, `-`, `e`, `E`)
        if char.isdigit() or char == "-":
            start = i
            while i < length and (
                json_str[i].isdigit()
                or json_str[i] in ".-eE"
                # `+` is only valid as an exponent sign, e.g. 1e+5.
                or (json_str[i] == "+" and json_str[i - 1] in "eE")
            ):
                i += 1
            tokens.append(
                {
                    "type": "number",
                    "value": json_str[start:i],
                    "start": start,
                    "end": i - 1,
                }
            )
            continue

        # Structural characters
        if char in structural_types:
            tokens.append(
                {
                    "type": structural_types[char],
                    "value": char,
                    "start": i,
                    "end": i,
                }
            )
            i += 1
            continue

        # Boolean/null literals
        for text, kind in literals:
            if json_str.startswith(text, i):
                tokens.append(
                    {
                        "type": kind,
                        "value": text,
                        "start": i,
                        "end": i + len(text) - 1,
                    }
                )
                i += len(text) - 1
                break

        # Step past the literal's last character, or past an unknown character.
        i += 1

    return tokens


def try_close_after_value(json_str: str, tokens: list[dict[str, Any]]) -> str | None:
    """Try to close JSON after a value token"""
    # Analyze nesting to determine what needs to be closed
    nesting_stack: list[str] = []

    for token in tokens[:-1]:  # Exclude the last token (which is the value)
        if token["type"] == "object_start":
            nesting_stack.append("}")
        elif token["type"] == "array_start":
            nesting_stack.append("]")
        elif (
            token["type"] in ["object_end", "array_end"]
            and nesting_stack
            and nesting_stack[-1] == token["value"]
        ):
            nesting_stack.pop()

    # Close remaining open structures
    closure = "".join(reversed(nesting_stack))
    candidate = json_str + closure

    try:
        json.loads(candidate)
        return candidate
    except json.JSONDecodeError:
        return None


def try_complete_structure(json_str: str, tokens: list[dict[str, Any]]) -> str | None:
    """Repair JSON whose last token is a dangling comma or colon.

    Args:
        json_str: The truncated JSON text.
        tokens: Tokens of ``json_str``; only the last one is inspected.

    Returns:
        The repaired JSON string, or ``None`` when the last token is neither
        a comma nor a colon or when no repair succeeds.
    """
    tail_kind = tokens[-1]["type"]

    if tail_kind == "comma":
        # Nothing follows the separator, so drop it and repair what is left.
        without_comma = json_str.rstrip().rstrip(",")
        return try_contextual_closure_repair(without_comma)

    if tail_kind != "colon":
        return None

    # A key without a value: try a few placeholder values in turn.
    for placeholder in ("null", '""', "[]", "{}"):
        try:
            outcome = try_contextual_closure_repair(json_str + placeholder)
        except (json.JSONDecodeError, TypeError, ValueError):
            continue
        if outcome:
            return outcome

    return None


def try_line_reconstruction_repair(json_str: str) -> str | None:
    """Drop trailing lines until the remainder can be repaired.

    The full text is tried first, then the text without its last line, and
    so on down to the first line alone. Each prefix is handed to
    ``try_contextual_closure_repair``.

    Args:
        json_str: The malformed JSON text.

    Returns:
        The first truthy repair result, or ``None`` if no prefix can be
        repaired.
    """
    rows = json_str.split("\n")

    # Longest prefix first, so as little input as possible is discarded.
    for keep in reversed(range(1, len(rows) + 1)):
        outcome = try_contextual_closure_repair("\n".join(rows[:keep]))
        if outcome:
            return outcome

    return None


def try_regex_pattern_repair(json_str: str) -> str | None:
    """Use regex to fix common JSON formatting issues"""
    fixed = json_str

    # Remove trailing commas before closing braces/brackets
    fixed = re.sub(r",(\s*[}\]])", r"\1", fixed)

    # Fix unescaped quotes in strings (basic attempt)
    fixed = re.sub(r'(?<!\\)"(?![,\]\}:\s]|$)', r'\\"', fixed)

    # Remove incomplete key-value pairs at the end
    fixed = re.sub(r',\s*"[^"]*"?\s*:?\s*$', "", fixed)

    # Try to parse the fixed version
    try:
        json.loads(fixed)
        return fixed
    except json.JSONDecodeError:
        pass

    # If that didn't work, try closing it
    return try_contextual_closure_repair(fixed)


def simple_bracket_repair(json_str: str) -> str:
    """Fallback: append the closers for every still-open ``{`` and ``[``.

    The text is scanned once while tracking string literals, so brackets
    inside strings are ignored, and the missing closers are appended
    innermost-first so that mixed nesting such as ``[{`` is closed as ``}]``.
    A closer that does not match the innermost open container is ignored.
    An unterminated string is not fixed here.

    Args:
        json_str: The malformed JSON text.

    Returns:
        ``json_str`` with the missing closers appended. The result is not
        validated and may still be invalid JSON.
    """
    closer_for = {"{": "}", "[": "]"}

    pending: list[str] = []  # closers still owed, outermost first
    in_string = False
    escaped = False

    for char in json_str:
        if in_string:
            if escaped:
                escaped = False
            elif char == "\\":
                escaped = True
            elif char == '"':
                in_string = False
            continue

        if char == '"':
            in_string = True
        elif char in closer_for:
            pending.append(closer_for[char])
        elif pending and char == pending[-1]:
            pending.pop()

    return json_str + "".join(reversed(pending))


def validate_and_repair_json(json_str: str) -> str:
    """Repair malformed JSON, preferring the ``json_repair`` library.

    The stripped input is first passed to ``repair_json``; a truthy result is
    returned as-is. Otherwise the in-house strategies in
    ``comprehensive_json_repair`` are tried and their result is validated
    with ``json.loads``.

    Args:
        json_str: The (possibly malformed) JSON text.

    Returns:
        The repaired JSON text.

    Raises:
        ValueError: If the in-house repair result still does not parse. The
            message contains both the parse error of the original input and
            the parse error of the repair attempt.
    """
    json_str = json_str.strip()

    # Try parsing with repair library
    good_json = repair_json(json_str)
    if good_json:
        return good_json

    # Try comprehensive repair
    try:
        repaired = comprehensive_json_repair(json_str)

        # Validate the repair
        json.loads(repaired)
        logger.info("✅ JSON successfully repaired!")
        return repaired

    except json.JSONDecodeError as repair_error:
        # Re-parse the untouched input so the message reports the real
        # original error rather than repeating the repair error.
        original_msg = "input parsed without error"
        try:
            json.loads(json_str)
        except json.JSONDecodeError as original_error:
            original_msg = original_error.msg

        logger.error("❌ Repair failed: %s", repair_error)
        raise ValueError(
            f"Could not repair JSON. Original error: {original_msg}, "
            + f"Repair error: {repair_error.msg}"
        ) from repair_error