| """Robust JSON extraction from LLM responses. |
| |
| Handles common LLM output artifacts: <think> tags, markdown fences, |
| preamble text, and truncated responses. |
| """ |
|
|
| import json |
| import logging |
| import re |
|
|
| logger = logging.getLogger(__name__) |
|
|
|
|
def _close_truncated_json(fragment: str) -> str:
    """Return *fragment* with its open string and containers terminated.

    Scans the fragment once, tracking JSON string state (honouring backslash
    escapes) and a stack of the closers required by every ``{`` / ``[`` that
    is still open at the end of the text.
    """
    closers = []
    in_string = False
    escaped = False
    for ch in fragment:
        if in_string:
            if escaped:
                escaped = False
            elif ch == "\\":
                escaped = True
            elif ch == '"':
                in_string = False
        elif ch == '"':
            in_string = True
        elif ch == "{":
            closers.append("}")
        elif ch == "[":
            closers.append("]")
        elif ch in "}]" and closers and closers[-1] == ch:
            closers.pop()

    if escaped:
        # A dangling backslash would escape the quote we are about to add.
        fragment = fragment[:-1]
    if in_string:
        fragment += '"'
    return fragment + "".join(reversed(closers))


def extract_json(text: str) -> dict:
    """Extract JSON from LLM response that may contain artifacts or truncation.

    Strategies (in order):
    1. Strip <think> tags and markdown fences, try direct parse.
    2. Depth-tracking brace scan for complete JSON objects.
    3. Truncation repair — close the unterminated string, arrays and
       objects of the last unclosed object and retry.

    Args:
        text: Raw LLM response.

    Returns:
        The parsed JSON value. Strategies 2 and 3 always yield an object
        (``dict``); the direct parse in strategy 1 returns whatever
        top-level JSON value the cleaned text holds.

    Raises:
        ValueError: If ``text`` is empty or no strategy produces valid JSON.
    """
    if not text:
        raise ValueError("Empty response from LLM")

    # Drop reasoning blocks, then fence markers (opening and closing).
    cleaned = re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL).strip()
    cleaned = re.sub(r"```(?:json)?\s*\n?", "", cleaned).strip()
    cleaned = cleaned.rstrip("`").strip()

    # Strategy 1: the cleaned text is already valid JSON.
    try:
        return json.loads(cleaned)
    except json.JSONDecodeError:
        pass

    # Strategy 2: find the first balanced {...} span that parses.
    depth = 0
    start = None
    in_string = False
    escape_next = False

    for i, ch in enumerate(cleaned):
        if in_string:
            if escape_next:
                escape_next = False
            elif ch == "\\":
                escape_next = True
            elif ch == '"':
                in_string = False
            continue
        if ch == '"':
            # Quotes in surrounding prose are not JSON strings; only track
            # string state once we are inside a candidate object.
            if depth > 0:
                in_string = True
        elif ch == "{":
            if depth == 0:
                start = i
            depth += 1
        elif ch == "}" and depth > 0:
            # The depth guard ignores stray closing braces in prose, which
            # would otherwise drive the depth negative and hide later objects.
            depth -= 1
            if depth == 0:
                try:
                    return json.loads(cleaned[start : i + 1])
                except json.JSONDecodeError:
                    # Not JSON (e.g. a "{placeholder}"); keep scanning.
                    start = None

    # Strategy 3: the response ended inside an object — try to close it.
    if depth > 0 and start is not None:
        fragment = cleaned[start:]
        repaired = _close_truncated_json(fragment)
        try:
            result = json.loads(repaired)
            logger.warning(
                "Repaired truncated JSON (closed %d brace(s)): %s...",
                depth,
                fragment[:80],
            )
            return result
        except json.JSONDecodeError:
            pass

    raise ValueError(f"No valid JSON found in LLM response: {text[:200]}")
|
|