Spaces:
Sleeping
Sleeping
Update utils.py
Browse files
utils.py
CHANGED
|
@@ -61,6 +61,115 @@ def extract_first_json_obj(s: str) -> Optional[Dict[str, Any]]:
|
|
| 61 |
return None
|
| 62 |
|
| 63 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 64 |
# ----------------------------
|
| 65 |
# Post-processing: remove template sentence
|
| 66 |
# ----------------------------
|
|
|
|
| 61 |
return None
|
| 62 |
|
| 63 |
|
| 64 |
+
# ----------------------------
|
| 65 |
+
# Extra robustness: remove stray unquoted fields (e.g., `confidence: 0.75`)
|
| 66 |
+
# that sometimes appear outside JSON strings due to generation glitches.
|
| 67 |
+
# ----------------------------
|
| 68 |
+
def _remove_unquoted_confidence_field(json_text: str) -> str:
    """
    Remove stray unquoted ``confidence: <number>`` fields from JSON-like text.

    The text is scanned character by character while tracking whether the
    cursor is inside a double-quoted string (honouring backslash escapes).
    Only an occurrence found OUTSIDE a string, followed by ``:``, a simple
    number (optional sign, digits, optional fraction, optional exponent) and
    then ``,`` or ``}``, is removed. Everything else is copied unchanged.

    Comma handling keeps the surrounding members well-formed:

    - a comma directly before the stray field is removed together with it,
      and the delimiter after the field is kept;
    - if the stray field is the first member (nothing but ``{`` before it)
      and is followed by ``,``, that following comma is dropped instead, so
      ``{confidence: 0.5, "a": 1}`` becomes ``{ "a": 1}`` rather than the
      invalid ``{, "a": 1}``.

    Args:
        json_text: Candidate JSON text, possibly containing the glitch.

    Returns:
        The text with matching stray fields removed. Input that contains no
        such field is returned character-for-character identical.
    """
    out_chars = []
    n = len(json_text)
    i = 0
    in_str = False
    esc = False

    def _pop_trailing_ws_and_optional_comma() -> bool:
        """Trim trailing whitespace and one comma; report if a comma went."""
        while out_chars and out_chars[-1].isspace():
            out_chars.pop()
        if out_chars and out_chars[-1] == ",":
            out_chars.pop()
            while out_chars and out_chars[-1].isspace():
                out_chars.pop()
            return True
        return False

    def _skip_ws(pos: int) -> int:
        """Return the first index >= pos that is not whitespace."""
        while pos < n and json_text[pos].isspace():
            pos += 1
        return pos

    def _skip_digits(pos: int) -> int:
        """Return the first index >= pos that is not a digit."""
        while pos < n and json_text[pos].isdigit():
            pos += 1
        return pos

    def _field_delimiter_index(start: int) -> int:
        """
        Match ``confidence : <number>`` beginning at ``start``.

        Returns the index of the ``,`` or ``}`` that follows the number, or
        -1 when the text at ``start`` is not a removable field.
        """
        j = _skip_ws(start + len("confidence"))
        if j >= n or json_text[j] != ":":
            return -1
        j = _skip_ws(j + 1)

        # Simple number: optional sign, integer digits, optional fraction.
        if j < n and json_text[j] in "+-":
            j += 1
        int_end = _skip_digits(j)
        has_digit = int_end > j
        j = int_end
        if j < n and json_text[j] == ".":
            frac_end = _skip_digits(j + 1)
            has_digit = has_digit or frac_end > j + 1
            j = frac_end
        if not has_digit:
            return -1

        # Optional exponent; only consumed when it carries at least one
        # digit, so a bare trailing "e" still fails the delimiter check.
        if j < n and json_text[j] in "eE":
            e = j + 1
            if e < n and json_text[e] in "+-":
                e += 1
            e_end = _skip_digits(e)
            if e_end > e:
                j = e_end

        k = _skip_ws(j)
        if k < n and json_text[k] in ",}":
            return k
        return -1

    while i < n:
        ch = json_text[i]

        if in_str:
            # Inside a string everything is copied; only track escapes and
            # the closing quote.
            out_chars.append(ch)
            if esc:
                esc = False
            elif ch == "\\":
                esc = True
            elif ch == '"':
                in_str = False
            i += 1
            continue

        if ch == '"':
            in_str = True
            out_chars.append(ch)
            i += 1
            continue

        # Detect an unquoted `confidence: <number>` outside strings.
        if json_text.startswith("confidence", i):
            k = _field_delimiter_index(i)
            if k != -1:
                removed_comma = _pop_trailing_ws_and_optional_comma()
                # First-member case: no comma precedes the field, so the
                # comma after it would be left dangling right after "{".
                if (
                    json_text[k] == ","
                    and not removed_comma
                    and (not out_chars or out_chars[-1] == "{")
                ):
                    k += 1
                i = k  # resume at the kept delimiter (or just past it)
                continue

        out_chars.append(ch)
        i += 1

    return "".join(out_chars)
|
| 147 |
+
|
| 148 |
+
|
| 149 |
+
def extract_json_obj_robust(s: str) -> Optional[Dict[str, Any]]:
    """
    Extract and parse a JSON object from a model output string.

    Steps:
    - Pass the text through `stop_at_complete_json` (defined elsewhere in
      this module); if that returns a falsy value the raw input is used.
    - Keep the span from the first `{` to the last `}` of that text.
    - Strip Markdown code-fence markers ("```json" and "```").
    - Run `_remove_unquoted_confidence_field` to repair a stray unquoted
      `confidence: <num>` field.
    - Parse with `json.loads`.

    Returns the parsed value on success. Returns None when no `{...}` span
    exists or when parsing raises.
    """
    text = stop_at_complete_json(s) or s

    first_brace = text.find("{")
    last_brace = text.rfind("}")
    # A missing "{" gives -1; a missing or earlier "}" fails the ordering test.
    if first_brace == -1 or last_brace <= first_brace:
        return None

    candidate = text[first_brace : last_brace + 1].strip()
    for fence in ("```json", "```"):
        candidate = candidate.replace(fence, "")
    candidate = _remove_unquoted_confidence_field(candidate.strip())

    # Best-effort by design: any parse failure yields None.
    try:
        parsed = json.loads(candidate)
    except Exception:
        return None
    return parsed
|
| 171 |
+
|
| 172 |
+
|
| 173 |
# ----------------------------
|
| 174 |
# Post-processing: remove template sentence
|
| 175 |
# ----------------------------
|