cmboulanger's picture
full implementation
37eaffd
from __future__ import annotations
import json
import re
from typing import Callable
from ..models.spans import SpanDescriptor
def _strip_fences(text: str) -> str:
    """Return the body of the first markdown code fence found in *text*.

    The input is stripped of surrounding whitespace first. If a complete
    fenced block (opening and closing triple backticks) occurs anywhere in
    the text, its stripped contents are returned, so explanatory prose
    before or after the fence is discarded. If no complete fence is found,
    the stripped text is returned unchanged.

    The opening fence may carry a language tag. Any tag that ends its line
    (``json``, ``JSON``, ``javascript``, ``jsonc`` ...) is dropped, and a
    ``json`` tag is additionally recognized, in any letter case, when the
    content follows on the same line.
    """
    text = text.strip()
    fence = re.search(
        # First alternative: an arbitrary info string, accepted only when it
        # is followed by a line break, so single-line fences such as
        # "```[1]```" never lose content to the tag. Second alternative: the
        # plain "json" tag, which may be followed directly by content.
        r"```(?:[A-Za-z][\w+.-]*(?=[ \t]*\r?\n)|json)?\s*\n?(.*?)\n?```",
        text,
        re.DOTALL | re.IGNORECASE,
    )
    if fence:
        return fence.group(1).strip()
    return text
def _parse_json_list(text: str) -> list[dict] | None:
    """Decode *text* as JSON and return it only when the top level is a list.

    Returns None when the text is not valid JSON or when the decoded value
    is any type other than a list.
    """
    try:
        decoded = json.loads(text)
    except json.JSONDecodeError:
        return None
    if not isinstance(decoded, list):
        return None
    return decoded
def _dicts_to_spans(raw: list[dict]) -> list[SpanDescriptor]:
    """Build a SpanDescriptor for every usable entry in *raw*.

    Entries that are not dicts are ignored, as are dicts whose "element",
    "text" or "context" value is missing or falsy. An "attrs" value that is
    not a dict is replaced by an empty dict.
    """
    descriptors: list[SpanDescriptor] = []
    for entry in raw:
        if not isinstance(entry, dict):
            continue

        element = entry.get("element", "")
        text = entry.get("text", "")
        context = entry.get("context", "")
        attrs = entry.get("attrs", {})

        # All three core fields must be present and non-empty.
        if not element or not text or not context:
            continue

        if not isinstance(attrs, dict):
            attrs = {}

        descriptor = SpanDescriptor(
            element=element,
            text=text,
            context=context,
            attrs=attrs,
        )
        descriptors.append(descriptor)
    return descriptors
def parse_response(
    response: str,
    call_fn: Callable[[str], str] | None = None,
    make_correction_prompt: Callable[[str, str], str] | None = None,
) -> list[SpanDescriptor]:
    """
    Turn the text of an LLM response into a list of SpanDescriptors.

    The response is passed through the fence stripper and then decoded as a
    JSON list. On success the list is converted to SpanDescriptors.

    If decoding fails and both *call_fn* and *make_correction_prompt* are
    given, one retry is made: *make_correction_prompt* is called with the
    original response and an error message, the resulting prompt is passed
    to *call_fn*, and the reply is parsed the same way.

    Raises ValueError when the response cannot be parsed and no retry is
    configured, or when the retry reply cannot be parsed either. The message
    includes the first 300 characters of the offending text.
    """
    first_attempt = _parse_json_list(_strip_fences(response))
    if first_attempt is not None:
        return _dicts_to_spans(first_attempt)

    # A retry needs both callbacks; with either one missing, fail right away.
    retry_configured = call_fn is not None and make_correction_prompt is not None
    if not retry_configured:
        raise ValueError(f"Failed to parse JSON from response: {response[:300]!r}")

    retry_response = call_fn(
        make_correction_prompt(response, "Response is not valid JSON")
    )
    second_attempt = _parse_json_list(_strip_fences(retry_response))
    if second_attempt is None:
        raise ValueError(f"Failed to parse JSON after retry: {retry_response[:300]!r}")
    return _dicts_to_spans(second_attempt)