titan-server / src /utils /jsonCleaner.ts
M-hv1's picture
Upload 3 files
7fd62bd verified
Raw
History Blame Contribute Delete
4.42 kB
/**
* src/utils/jsonCleaner.ts
*
* LLM responses almost never contain bare JSON. They typically:
* - Wrap JSON in ```json … ``` code fences
* - Prepend sentences like "Here is the JSON:"
* - Mix prose before/after the JSON block
* - Contain nested braces inside string literals
*
* This module extracts the first valid JSON object `{}` or array `[]`
* from any arbitrary LLM response string using a character-level scan
* that correctly handles strings, escapes, and nesting.
*/
// ─── Public API ───────────────────────────────────────────────────────────────
/**
 * Extracts and parses the first valid JSON object or array from an LLM response.
 *
 * Resolution order:
 *  1. If the text contains a markdown code fence, the content of the first
 *     fence is parsed as-is.
 *  2. Otherwise (or if the fenced content is not valid JSON) the text is
 *     scanned left to right for top-level `{…}` / `[…]` spans. Each span is
 *     parsed strictly, then via `attemptRepair`; the first span that yields a
 *     value wins. Spans that are not JSON (e.g. a `[citation]` or a
 *     `{placeholder}` in the prose) are skipped instead of aborting the search.
 *
 * The scan stops at the first opener that is never closed: everything after it
 * lies inside that unclosed span, so returning a later match would hand back a
 * fragment of what is most likely truncated output.
 *
 * @param raw - The full LLM response text
 * @returns The parsed value (cast to `T`; the shape is not validated)
 * @throws Error if `raw` is empty or not a string, if it contains no `{` or `[`,
 *         or if no span could be parsed — in that case the error describes the
 *         first span that failed
 */
export function extractJson<T = unknown>(raw: string): T {
  if (!raw || typeof raw !== 'string') {
    throw new Error('extractJson: input must be a non-empty string');
  }

  // ── Strategy 1: Strip a markdown code fence if present ──────────────────
  const fenceMatch = raw.match(
    /```(?:json|yaml|js|javascript|typescript|dart)?\s*([\s\S]*?)```/i
  );
  if (fenceMatch) {
    const inner = fenceMatch[1].trim();
    try {
      return JSON.parse(inner) as T;
    } catch {
      // Fenced block wasn't valid JSON — fall through to the bracket scan
    }
  }

  // ── Strategy 2: Try each top-level `{…}` / `[…]` span in order ──────────
  let objStart = raw.indexOf('{');
  let arrStart = raw.indexOf('[');
  if (objStart === -1 && arrStart === -1) {
    throw new Error(
      `extractJson: no JSON object or array found in response.\n` +
        `Response preview: ${raw.slice(0, 200)}`
    );
  }

  let firstFailure: Error | null = null;
  let cursor = 0;

  for (;;) {
    // Refresh a cached opener position only once the cursor has moved past it,
    // which keeps the total search cost linear in the input length.
    if (objStart !== -1 && objStart < cursor) objStart = raw.indexOf('{', cursor);
    if (arrStart !== -1 && arrStart < cursor) arrStart = raw.indexOf('[', cursor);
    if (objStart === -1 && arrStart === -1) break;

    // Whichever opener comes first starts the next candidate.
    const start =
      arrStart === -1 || (objStart !== -1 && objStart < arrStart)
        ? objStart
        : arrStart;

    const end = scanToMatchingClose(raw, start);
    if (end === -1) {
      if (firstFailure === null) {
        firstFailure = new Error(
          `extractJson: unbalanced brackets in LLM response.\n` +
            `Response preview: ${raw.slice(start, start + 300)}`
        );
      }
      // Anything further right is nested inside this unclosed span.
      break;
    }

    const candidate = raw.slice(start, end + 1);
    try {
      return JSON.parse(candidate) as T;
    } catch (parseErr) {
      // Last resort: attempt to fix common LLM JSON mistakes
      const repaired = attemptRepair(candidate);
      if (repaired !== null) return repaired as T;
      if (firstFailure === null) {
        firstFailure = new Error(
          `extractJson: JSON.parse failed.\n` +
            `Error: ${String(parseErr)}\n` +
            `Candidate (first 300 chars): ${candidate.slice(0, 300)}`
        );
      }
    }

    // Skip the whole rejected span so its inner brackets are not retried.
    cursor = end + 1;
  }

  if (firstFailure === null) {
    // Defensive: the loop only exits after recording at least one failure.
    throw new Error(
      `extractJson: no JSON object or array found in response.\n` +
        `Response preview: ${raw.slice(0, 200)}`
    );
  }
  throw firstFailure;
}

/**
 * Returns the index of the bracket that closes the opener at `start`, or -1
 * if the text ends first.
 *
 * Only brackets of the opener's own kind are counted, and everything between
 * double quotes (honouring backslash escapes) is ignored, so braces inside
 * string literals do not affect the depth.
 */
function scanToMatchingClose(text: string, start: number): number {
  const opener = text.charAt(start);
  const closer = opener === '{' ? '}' : ']';
  let depth = 0;
  let inString = false;
  let escaped = false;

  for (let i = start; i < text.length; i++) {
    const ch = text.charAt(i);
    if (escaped) {
      // Character following a backslash inside a string: always literal.
      escaped = false;
      continue;
    }
    if (inString) {
      if (ch === '\\') escaped = true;
      else if (ch === '"') inString = false;
      continue;
    }
    if (ch === '"') inString = true;
    else if (ch === opener) depth++;
    else if (ch === closer && --depth === 0) return i;
  }
  return -1;
}
/**
 * Safe counterpart of `extractJson`: never throws.
 *
 * @param raw - The full LLM response text
 * @returns The parsed value, or `null` when `extractJson` raises for any reason
 */
export function tryExtractJson<T = unknown>(raw: string): T | null {
  let parsed: T | null = null;
  try {
    parsed = extractJson<T>(raw);
  } catch {
    // Extraction or parsing failed — report it as `null` rather than an exception.
  }
  return parsed;
}
// ─── Private Helpers ──────────────────────────────────────────────────────────
/**
 * Attempts common repairs to malformed LLM JSON and parses the result:
 *  - Trailing commas before `}` or `]`
 *  - Single-quoted strings (re-delimited with double quotes)
 *  - Unquoted identifier-like property keys (`{name: 1}`)
 *
 * The primary repair is string-aware: text inside double-quoted strings is
 * left alone, so an apostrophe or a `, ]` sequence inside a value is not
 * rewritten. If that result still does not parse, the older blind rewrite
 * (strip trailing commas everywhere, swap every `'` for `"`) is tried so that
 * input it used to rescue is still rescued.
 *
 * @param json - The candidate JSON text that failed strict parsing
 * @returns The parsed value, or `null` if no repair produced valid JSON
 */
function attemptRepair(json: string): unknown | null {
  const strategies: Array<(text: string) => string> = [
    normalizeLenientJson,
    legacyQuoteSwap,
  ];
  for (const strategy of strategies) {
    try {
      return JSON.parse(strategy(json));
    } catch {
      // This rewrite did not yield valid JSON — try the next one.
    }
  }
  return null;
}

/**
 * Rewrites lenient JSON into strict JSON in a single left-to-right pass,
 * tracking string boundaries and the enclosing container so that only
 * structural characters are touched.
 */
function normalizeLenientJson(text: string): string {
  const whitespace = /\s/;
  const identStart = /[A-Za-z_$]/;
  const identPart = /[\w$]/;

  const openContainers: string[] = [];
  let out = '';
  // True while the next non-whitespace token sits where an object key belongs.
  let expectKey = false;
  let i = 0;

  while (i < text.length) {
    const ch = text.charAt(i);

    if (ch === '"' || ch === "'") {
      const [literal, next] = requoteString(text, i);
      out += literal;
      i = next;
      expectKey = false;
      continue;
    }

    if (ch === ',') {
      let look = i + 1;
      while (look < text.length && whitespace.test(text.charAt(look))) look++;
      const following = text.charAt(look);
      // A comma directly before a closer is a trailing comma: drop it.
      if (following !== '}' && following !== ']') {
        out += ch;
        expectKey = openContainers[openContainers.length - 1] === '{';
      }
      i++;
      continue;
    }

    if (expectKey && identStart.test(ch)) {
      let stop = i + 1;
      while (stop < text.length && identPart.test(text.charAt(stop))) stop++;
      let look = stop;
      while (look < text.length && whitespace.test(text.charAt(look))) look++;
      const word = text.slice(i, stop);
      // Only quote the word when it is actually used as a key (`word:`).
      out += text.charAt(look) === ':' ? `"${word}"` : word;
      i = stop;
      expectKey = false;
      continue;
    }

    if (ch === '{' || ch === '[') {
      openContainers.push(ch);
      expectKey = ch === '{';
    } else if (ch === '}' || ch === ']') {
      openContainers.pop();
      expectKey = false;
    } else if (!whitespace.test(ch)) {
      expectKey = false;
    }
    out += ch;
    i++;
  }

  return out;
}

/**
 * Reads the string literal starting at `start` (delimited by `"` or `'`) and
 * returns it re-delimited with double quotes, plus the index just past it.
 * An unterminated literal is returned as far as it goes; the caller's
 * `JSON.parse` will then reject it.
 */
function requoteString(text: string, start: number): [string, number] {
  const quote = text.charAt(start);
  let literal = '"';
  let i = start + 1;

  while (i < text.length) {
    const ch = text.charAt(i);
    if (ch === '\\') {
      const escapedChar = text.charAt(i + 1);
      // `\'` is not a legal JSON escape; inside double quotes the apostrophe
      // needs no escaping at all. Every other escape is copied unchanged.
      literal += escapedChar === "'" ? "'" : ch + escapedChar;
      i += 2;
      continue;
    }
    if (ch === quote) {
      return [literal + '"', i + 1];
    }
    // A bare `"` inside a single-quoted string must be escaped once the
    // delimiters become double quotes.
    literal += ch === '"' ? '\\"' : ch;
    i++;
  }

  return [literal, i];
}

/** The original blind rewrite, kept as a fallback for oddly mixed quoting. */
function legacyQuoteSwap(text: string): string {
  return text.replace(/,\s*([}\]])/g, '$1').replace(/'/g, '"');
}