#!/usr/bin/env python3 """ Robust JSON Extraction from LLM Output ======================================= LLMs frequently wrap JSON in markdown, add conversational preamble/postamble, use Python-style booleans, or output malformed JSON. This module handles all of those cases with a multi-strategy approach. Extracted from production tool calling orchestrator. Battle-tested against Hermes-3, Llama 3.3, Qwen2, and Mistral models. Usage: from robust_json_extraction import extract_json, extract_tool_calls # Handle any LLM output format data = extract_json('Here is the result: ```json\\n{"key": "value"}\\n``` Hope that helps!') # Extract tool calls from Hermes-format XML calls = extract_tool_calls('{"name": "search", "arguments": {"q": "test"}}') """ import json import re import ast import xml.etree.ElementTree as ET from json import JSONDecoder from typing import Any, Dict, List, Optional def extract_json(text: str) -> Any: """ Extract JSON from LLM output, handling common issues: 1. Markdown code blocks (```json ... ```) 2. Preamble text ("Here is the result: {...") 3. Postamble text ("...} Let me know if you need help!") 4. Python-style booleans (True/False/None instead of true/false/null) Returns parsed JSON data (dict, list, etc.) Raises json.JSONDecodeError if no valid JSON can be extracted. """ text = text.strip() if not text: raise json.JSONDecodeError("Empty input", text, 0) # Layer 1: Strip markdown code blocks if "```" in text: # Match ```json or ``` followed by content until closing ``` match = re.search(r'```(?:json)?\s*\n(.*?)\n```', text, re.DOTALL) if match: text = match.group(1).strip() else: # Handle unclosed code blocks start = text.find('```') if start != -1: first_newline = text.find('\n', start) if first_newline != -1: text = text[first_newline + 1:] if text.endswith("```"): text = text[:-3].strip() # Layer 2: Find first { or [ (skip preamble) if not text.startswith(('{', '[')): for char in ['{', '[']: idx = text.find(char) if idx != -1: text = text[idx:] break # Layer 3: Try parsing, with raw_decode fallback for postamble try: return json.loads(text) except json.JSONDecodeError as original_error: # Try raw_decode — stops at end of valid JSON, ignoring trailing text decoder = JSONDecoder() try: data, _ = decoder.raw_decode(text) return data except json.JSONDecodeError: pass # Try fixing Python-style booleans/None try: fixed = text.replace('True', 'true').replace('False', 'false').replace('None', 'null') return json.loads(fixed) except json.JSONDecodeError: pass # Try raw_decode on fixed text try: fixed = text.replace('True', 'true').replace('False', 'false').replace('None', 'null') data, _ = decoder.raw_decode(fixed) return data except json.JSONDecodeError: pass raise original_error def parse_single_call(json_text: str) -> Optional[Dict]: """ Parse a single tool call JSON using multiple strategies. Returns dict with 'name' and 'arguments' keys, or None if parsing fails. """ json_text = json_text.strip() if not json_text: return None # Strategy 1: Standard JSON try: return json.loads(json_text) except json.JSONDecodeError: pass # Strategy 2: Fix JS booleans and use ast.literal_eval try: python_text = json_text.replace('true', 'True').replace('false', 'False').replace('null', 'None') return ast.literal_eval(python_text) except (SyntaxError, ValueError): pass # Strategy 3: Fix Python->JSON issues (single quotes, capitalized booleans) try: fixed = json_text.replace("'", '"').replace('True', 'true').replace('False', 'false').replace('None', 'null') return json.loads(fixed) except (json.JSONDecodeError, ValueError): pass # Strategy 4: Regex extraction as last resort name_match = re.search(r"['\"]?name['\"]?\s*:\s*['\"]([^'\"]+)['\"]", json_text) if name_match: name = name_match.group(1) arguments = {} args_match = re.search(r"['\"]?arguments['\"]?\s*:\s*(\{[^}]+\})", json_text) if args_match: try: arguments = json.loads(args_match.group(1)) except json.JSONDecodeError: try: arguments = ast.literal_eval(args_match.group(1)) except (SyntaxError, ValueError): pass return {"name": name, "arguments": arguments} return None def extract_tool_calls(assistant_message: str) -> List[Dict]: """ Extract tool calls from an assistant message containing XML tags. Supports: - Single tool call: {"name": "fn", "arguments": {...}} - Nested format: {"tool_calls": [...]} - Multiple JSON objects in one block (line-by-line) - Malformed XML (regex fallback) Returns list of dicts, each with 'name' and 'arguments' keys. """ tool_calls = [] # Try XML parsing first try: xml_root = f"{assistant_message}" root = ET.fromstring(xml_root) for element in root.findall(".//tool_call"): raw_text = (element.text or "").strip() if not raw_text: continue # Try parsing as single JSON object json_data = parse_single_call(raw_text) if json_data: # Check for nested tool_calls array if isinstance(json_data, dict) and 'tool_calls' in json_data: nested = json_data.get('tool_calls', []) if isinstance(nested, list): tool_calls.extend(nested) elif isinstance(json_data, dict) and 'name' in json_data: tool_calls.append(json_data) else: # Fallback: line-by-line parsing for line in raw_text.split('\n'): line = line.strip() if line.startswith('{'): parsed = parse_single_call(line) if parsed: tool_calls.append(parsed) except ET.ParseError: # Regex fallback for malformed XML pattern = re.compile(r'(.*?)', re.DOTALL) for match in pattern.findall(assistant_message): raw_text = match.strip() json_data = parse_single_call(raw_text) if json_data: if isinstance(json_data, dict) and 'tool_calls' in json_data: tool_calls.extend(json_data.get('tool_calls', [])) elif isinstance(json_data, dict) and 'name' in json_data: tool_calls.append(json_data) return tool_calls # ============================================================================ # Examples / Self-test # ============================================================================ if __name__ == "__main__": print("=" * 60) print("Testing robust JSON extraction") print("=" * 60) # Test 1: Clean JSON assert extract_json('{"key": "value"}') == {"key": "value"} print(" [PASS] Clean JSON") # Test 2: Markdown-wrapped JSON assert extract_json('```json\n{"key": "value"}\n```') == {"key": "value"} print(" [PASS] Markdown-wrapped JSON") # Test 3: Preamble + JSON assert extract_json('Here is the result: {"key": "value"}') == {"key": "value"} print(" [PASS] Preamble text") # Test 4: JSON + postamble assert extract_json('{"key": "value"} Hope that helps!') == {"key": "value"} print(" [PASS] Postamble text") # Test 5: Preamble + markdown + postamble result = extract_json('Sure! ```json\n{"key": "value"}\n``` Let me know!') assert result == {"key": "value"} print(" [PASS] Preamble + markdown + postamble") # Test 6: Python-style booleans assert extract_json('{"active": True, "deleted": False, "value": None}') == { "active": True, "deleted": False, "value": None } print(" [PASS] Python-style booleans") print("\n" + "=" * 60) print("Testing tool call extraction") print("=" * 60) # Test 7: Single tool call calls = extract_tool_calls('{"name": "search", "arguments": {"q": "test"}}') assert len(calls) == 1 assert calls[0]["name"] == "search" print(" [PASS] Single tool call") # Test 8: Nested tool_calls array calls = extract_tool_calls( '{"tool_calls": [{"name": "a", "arguments": {}}, {"name": "b", "arguments": {}}]}' ) assert len(calls) == 2 print(" [PASS] Nested tool_calls array") # Test 9: Mixed content calls = extract_tool_calls( 'I will search for that.\n\n{"name": "search", "arguments": {"q": "hello"}}\n\nDone.' ) assert len(calls) == 1 assert calls[0]["name"] == "search" print(" [PASS] Mixed content with tool call") print("\nAll tests passed.")