/**
 * src/utils/jsonCleaner.ts
 *
 * LLM responses almost never contain bare JSON. They typically:
 *   - Wrap JSON in ```json … ``` code fences
 *   - Prepend sentences like "Here is the JSON:"
 *   - Mix prose before/after the JSON block
 *   - Contain nested braces inside string literals
 *
 * This module extracts the first valid JSON object `{}` or array `[]`
 * from any arbitrary LLM response string using a character-level scan
 * that correctly handles strings, escapes, and nesting.
 */

// ─── Module constants ─────────────────────────────────────────────────────────

/** Matches the first markdown code fence (optionally language-tagged) and captures its body. */
const FENCE_PATTERN =
  /```(?:json|yaml|js|javascript|typescript|dart)?\s*([\s\S]*?)```/i;

/**
 * Upper bound on how many opening brackets are tried as candidate starts.
 * Every attempt can rescan the remainder of the input, so this keeps
 * pathological inputs (thousands of unmatched brackets) from going quadratic.
 */
const MAX_CANDIDATE_ATTEMPTS = 256;

const IDENT_START = /[A-Za-z_$]/;
const IDENT_PART = /[A-Za-z0-9_$]/;
const WHITESPACE = /\s/;

// ─── Public API ───────────────────────────────────────────────────────────────

/**
 * Extracts and parses the first JSON object or array from an LLM response.
 *
 * Strategy 1: if the text contains a markdown code fence, its body is parsed
 * strictly. Strategy 2: otherwise (or if that fails) every `{` / `[` in the
 * raw text is tried in order as a candidate start; the matching close bracket
 * is located with a string-aware scan, and the slice is parsed strictly and
 * then, if that fails, leniently via `attemptRepair`. Trying later brackets
 * lets prose such as `see [the] result: {...}` still yield the real payload.
 *
 * The result is cast to `T` without runtime validation; callers that need
 * guarantees about the shape must validate it themselves.
 *
 * @typeParam T - Expected shape of the parsed value (defaults to `unknown`)
 * @param raw - The full LLM response text
 * @returns The parsed value
 * @throws Error if `raw` is empty or not a string, if it contains no `{` or
 *   `[`, or if no candidate parses. In the last case the error describes the
 *   first candidate (unbalanced brackets or the `JSON.parse` failure).
 */
export function extractJson<T = unknown>(raw: string): T {
  if (!raw || typeof raw !== 'string') {
    throw new Error('extractJson: input must be a non-empty string');
  }

  // ── Strategy 1: Strip a markdown code fence if present ──────────────────
  const fenceMatch = raw.match(FENCE_PATTERN);
  if (fenceMatch) {
    const inner = (fenceMatch[1] ?? '').trim();
    try {
      return JSON.parse(inner) as T;
    } catch {
      // Fenced block wasn't valid JSON — fall through to the bracket scan.
    }
  }

  // ── Strategy 2: Try each `{` / `[` as the start of a JSON value ─────────
  let firstFailure: Error | null = null;
  let searchFrom = 0;

  for (let attempt = 0; attempt < MAX_CANDIDATE_ATTEMPTS; attempt++) {
    const start = findNextOpener(raw, searchFrom);
    if (start === -1) break;
    // Resume just past this opener so a valid value nested inside a failed
    // candidate can still be found.
    searchFrom = start + 1;

    const end = findMatchingClose(raw, start);
    if (end === -1) {
      if (firstFailure === null) {
        firstFailure = new Error(
          `extractJson: unbalanced brackets in LLM response.\n` +
            `Response preview: ${raw.slice(start, start + 300)}`
        );
      }
      continue;
    }

    const candidate = raw.slice(start, end + 1);
    try {
      return JSON.parse(candidate) as T;
    } catch (parseErr: unknown) {
      // Last resort: attempt to fix common LLM JSON mistakes.
      const repaired = attemptRepair(candidate);
      if (repaired !== null) return repaired as T;
      if (firstFailure === null) {
        firstFailure = new Error(
          `extractJson: JSON.parse failed.\n` +
            `Error: ${String(parseErr)}\n` +
            `Candidate (first 300 chars): ${candidate.slice(0, 300)}`
        );
      }
    }
  }

  // Report the earliest problem: it refers to the bracket the reader sees first.
  if (firstFailure !== null) throw firstFailure;

  throw new Error(
    `extractJson: no JSON object or array found in response.\n` +
      `Response preview: ${raw.slice(0, 200)}`
  );
}

/**
 * Non-throwing variant of extractJson.
 * Returns `null` if extraction or parsing fails.
 *
 * @typeParam T - Expected shape of the parsed value (defaults to `unknown`)
 * @param raw - The full LLM response text
 * @returns The parsed value, or `null` when `extractJson` throws
 */
export function tryExtractJson<T = unknown>(raw: string): T | null {
  try {
    return extractJson<T>(raw);
  } catch {
    return null;
  }
}

// ─── Private Helpers ──────────────────────────────────────────────────────────

/** Returns the index of the first `{` or `[` at or after `from`, or -1 if there is none. */
function findNextOpener(text: string, from: number): number {
  for (let i = from; i < text.length; i++) {
    const ch = text.charAt(i);
    if (ch === '{' || ch === '[') return i;
  }
  return -1;
}

/**
 * Returns the index of the bracket that closes the opener at `start`, or -1 if
 * the text ends first. Only brackets of the opener's own kind are counted, and
 * anything inside double-quoted strings (including escaped quotes) is ignored.
 */
function findMatchingClose(text: string, start: number): number {
  const openChar = text.charAt(start);
  const closeChar = openChar === '{' ? '}' : ']';

  let depth = 0;
  let inString = false;
  let escape = false;

  for (let i = start; i < text.length; i++) {
    const ch = text.charAt(i);

    if (escape) {
      // The previous character was a backslash inside a string: skip this one.
      escape = false;
      continue;
    }
    if (inString) {
      if (ch === '\\') escape = true;
      else if (ch === '"') inString = false;
      continue;
    }

    // Not in a string
    if (ch === '"') {
      inString = true;
    } else if (ch === openChar) {
      depth++;
    } else if (ch === closeChar) {
      depth--;
      if (depth === 0) return i;
    }
  }
  return -1;
}

/** Returns the index of the first non-whitespace character at or after `from` (may equal `text.length`). */
function skipWhitespace(text: string, from: number): number {
  let i = from;
  while (i < text.length && WHITESPACE.test(text.charAt(i))) i++;
  return i;
}

/**
 * Rewrites lenient JSON into strict JSON text without touching the contents of
 * double-quoted strings. Handles trailing commas, single-quoted strings and
 * bare identifier keys. Returns `null` when a single-quoted string is never
 * closed; the returned text is not guaranteed to parse.
 */
function rewriteLenientJson(json: string): string | null {
  const n = json.length;
  let out = '';
  let lastSignificant = ''; // last non-whitespace character emitted
  let i = 0;

  while (i < n) {
    const ch = json.charAt(i);

    // Double-quoted string: already valid JSON syntax, copy it verbatim.
    if (ch === '"') {
      let j = i + 1;
      while (j < n && json.charAt(j) !== '"') {
        j += json.charAt(j) === '\\' ? 2 : 1;
      }
      out += json.slice(i, j + 1);
      i = j + 1;
      lastSignificant = '"';
      continue;
    }

    // Single-quoted string: re-quote, escaping inner `"` and unescaping `\'`.
    if (ch === "'") {
      let body = '';
      let j = i + 1;
      let closed = false;
      while (j < n) {
        const c = json.charAt(j);
        if (c === "'") {
          closed = true;
          break;
        }
        if (c === '\\' && j + 1 < n) {
          const escaped = json.charAt(j + 1);
          body += escaped === "'" ? "'" : c + escaped;
          j += 2;
        } else {
          body += c === '"' ? '\\"' : c;
          j += 1;
        }
      }
      if (!closed) return null;
      out += `"${body}"`;
      i = j + 1;
      lastSignificant = '"';
      continue;
    }

    // Trailing comma: drop it when the next significant character closes a container.
    if (ch === ',') {
      const next = json.charAt(skipWhitespace(json, i + 1));
      if (next !== '}' && next !== ']') {
        out += ch;
        lastSignificant = ch;
      }
      i++;
      continue;
    }

    // Bare word: quote it only when it sits in key position (`{word:` or `,word:`),
    // so literals such as true/false/null and exponents like 1e5 are left alone.
    if (IDENT_START.test(ch)) {
      let j = i + 1;
      while (j < n && IDENT_PART.test(json.charAt(j))) j++;
      const word = json.slice(i, j);
      const isKey =
        (lastSignificant === '{' || lastSignificant === ',') &&
        json.charAt(skipWhitespace(json, j)) === ':';
      out += isKey ? `"${word}"` : word;
      lastSignificant = isKey ? '"' : word.charAt(word.length - 1);
      i = j;
      continue;
    }

    out += ch;
    if (!WHITESPACE.test(ch)) lastSignificant = ch;
    i++;
  }

  return out;
}

/**
 * Attempts common repairs to malformed LLM JSON:
 *   - Trailing commas before `}` or `]`
 *   - Single-quoted strings
 *   - Unquoted property keys (bare identifiers directly after `{` or `,`)
 *
 * A string-aware rewrite is tried first so that apostrophes, commas and
 * brackets inside double-quoted strings are preserved. If that does not yield
 * parseable JSON, a blunt regex-based repair (strip trailing commas, turn every
 * `'` into `"`) is tried as a fallback for inputs the careful pass rejects.
 *
 * @param json - Candidate JSON text that failed strict parsing
 * @returns The parsed value, or `null` if no repair produced valid JSON
 */
function attemptRepair(json: string): unknown | null {
  const careful = rewriteLenientJson(json);
  if (careful !== null) {
    try {
      return JSON.parse(careful);
    } catch {
      // Fall through to the blunt repair below.
    }
  }

  // Blunt fallback: may alter string contents, so it only runs when the
  // string-aware pass could not produce valid JSON.
  const blunt = json.replace(/,\s*([}\]])/g, '$1').replace(/'/g, '"');
  try {
    return JSON.parse(blunt);
  } catch {
    return null;
  }
}