File size: 4,946 Bytes
61f8894 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 |
"""
JSON parsing utilities for handling malformed LLM outputs.
Provides robust parsing with fallback strategies.
"""
import json
import re
from typing import Any, Dict, Optional
def _load_json_container(candidate: str) -> Optional[Any]:
    """Decode ``candidate`` as JSON, returning it only if it is a dict or list.

    Returns None when decoding fails or when the decoded value is a scalar
    (number, string, bool, null), so callers never receive a non-container.
    """
    try:
        decoded = json.loads(candidate)
    except (ValueError, RecursionError):
        # json.JSONDecodeError subclasses ValueError; RecursionError covers
        # pathologically deep nesting.
        return None
    return decoded if isinstance(decoded, (dict, list)) else None


def _outermost_span(text: str, opener: str, closer: str) -> Optional[str]:
    """Return the substring from the first ``opener`` to the last ``closer``.

    Returns None when either delimiter is missing or they are out of order.
    """
    start = text.find(opener)
    end = text.rfind(closer)
    if start == -1 or end <= start:
        return None
    return text[start : end + 1]


def parse_llm_json(text: str) -> Optional[Dict[str, Any]]:
    """
    Parse JSON from LLM output with multiple fallback strategies.

    Strategies, tried in order until one succeeds:
        1. Parse the whole text as JSON.
        2. Parse the span from the first "{" to the last "}".
        3. Parse the span from the first "[" to the last "]".
        4. Repair the "{...}" span: collapse line breaks, drop trailing
           commas, and (only if that is not enough) swap single quotes for
           double quotes.
        5. Scrape ``key: value`` pairs with a regex (flat and lossy).

    Args:
        text: Raw text from LLM that should contain JSON

    Returns:
        The parsed JSON object as a dictionary (or a list when the payload
        is a JSON array), or None if nothing could be parsed. Bare scalars
        such as ``42`` or ``"abc"`` are not treated as a successful parse.
    """
    if not text or not isinstance(text, str):
        return None

    # Strategy 1: direct parsing (containers only, see _load_json_container).
    result = _load_json_container(text)
    if result is not None:
        return result

    # Strategies 2 and 3: pull the outermost object, then the outermost
    # array, out of any surrounding chatter.
    for opener, closer in (("{", "}"), ("[", "]")):
        span = _outermost_span(text, opener, closer)
        if span is not None:
            result = _load_json_container(span)
            if result is not None:
                return result

    # Strategy 4: repair common formatting mistakes in the object span.
    flattened = re.sub(r"\n\s*", " ", text)
    span = _outermost_span(flattened, "{", "}")
    if span is not None:
        # Remove trailing commas before } or ]
        repaired = re.sub(r",(\s*[}\]])", r"\1", span)
        attempts = [repaired]
        if "'" in repaired:
            # The quote swap is a blunt heuristic that corrupts apostrophes
            # inside strings, so it is only tried after the gentler repair.
            attempts.append(repaired.replace("'", '"'))
        for attempt in attempts:
            result = _load_json_container(attempt)
            if result is not None:
                return result

    # Strategy 5: last resort, scrape flat key/value pairs.
    pairs = re.findall(r'"?(\w+)"?\s*:\s*"?([^",}\]]+)"?', text)
    scraped: Dict[str, Any] = {}
    for key, raw_value in pairs:
        # Prefer a numeric value when the text converts cleanly.
        try:
            scraped[key] = float(raw_value) if "." in raw_value else int(raw_value)
        except ValueError:
            scraped[key] = raw_value.strip()
    return scraped or None
def parse_tool_input(input_str: str) -> Dict[str, Any]:
    """
    Parse tool input from LLM, handling both string and JSON inputs.

    Args:
        input_str: Input string from LLM (may be JSON or plain string).
            A dict is also accepted and returned unchanged.

    Returns:
        Dictionary with parsed values. A plain value with no JSON
        punctuation is wrapped as ``{"user_id": value}``. An empty dict
        signals that the input could not be interpreted.
    """
    # If it's already a dict, return it
    if isinstance(input_str, dict):
        return input_str

    # Accept only a non-empty dict from the JSON parser. It can also yield
    # lists or scalars (e.g. '"12345"' decodes to a bare string), and
    # returning those would break the Dict return contract and bypass the
    # user_id wrapping below.
    parsed = parse_llm_json(input_str)
    if isinstance(parsed, dict) and parsed:
        return parsed

    if not isinstance(input_str, str):
        return {}

    # Drop surrounding whitespace and quotes the LLM may have added.
    candidate = input_str.strip().strip('"').strip("'")

    # Leftover JSON punctuation means this was structured input that failed
    # to parse; report failure instead of guessing.
    if any(marker in candidate for marker in ("{", "}", "[", "]", ":")):
        return {}

    # A simple value is treated as a user_id.
    return {"user_id": candidate}
def extract_json_value(text: str, key: str, default: Any = None) -> Any:
    """
    Extract a single value from text containing JSON.

    Tries a full parse via ``parse_llm_json`` first. If that yields a
    non-empty dict, the lookup is answered from it (returning ``default``
    when the key is absent). Otherwise falls back to a regex search for
    ``"key": value`` in the raw text.

    Args:
        text: Text containing JSON
        key: Key to extract
        default: Default value if key not found

    Returns:
        Extracted value or default. Values found by the regex fallback are
        converted to int/float when possible, otherwise returned as a
        stripped string.
    """
    try:
        parsed = parse_llm_json(text)
    except Exception:
        # Best-effort lookup: a parser failure must not reach the caller,
        # so fall through to the regex search instead.
        parsed = None
    if parsed and isinstance(parsed, dict):
        return parsed.get(key, default)

    # The regex fallback only makes sense on string input.
    if not isinstance(text, str):
        return default

    # Escape the key so regex metacharacters in it (".", "+", "(", ...) are
    # matched literally instead of changing or breaking the pattern.
    pattern = rf'"{re.escape(str(key))}"\s*:\s*"?([^",}}\]]+)"?'
    match = re.search(pattern, text)
    if match is None:
        return default

    value = match.group(1).strip()
    # Try to convert to number
    try:
        return float(value) if "." in value else int(value)
    except ValueError:
        return value
|