maxime-antoine-dev committed on
Commit
e09b7e5
·
verified ·
1 Parent(s): d9b52fd

Update utils.py

Browse files
Files changed (1) hide show
  1. utils.py +109 -0
utils.py CHANGED
@@ -61,6 +61,115 @@ def extract_first_json_obj(s: str) -> Optional[Dict[str, Any]]:
61
  return None
62
 
63
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
64
  # ----------------------------
65
  # Post-processing: remove template sentence
66
  # ----------------------------
 
61
  return None
62
 
63
 
64
+ # ----------------------------
65
+ # Extra robustness: remove stray unquoted fields (e.g., `confidence: 0.75`)
66
+ # that sometimes appear outside JSON strings due to generation glitches.
67
+ # ----------------------------
68
def _remove_unquoted_confidence_field(json_text: str) -> str:
    """
    Remove stray unquoted `confidence: <number>` fields from JSON-like text.

    Scans `json_text` character by character, tracking whether the cursor is
    inside a double-quoted string (backslash escapes are honoured). Outside
    strings, the literal text `confidence`, followed by optional whitespace,
    `:`, a simple decimal number (optional sign, digits, optional fraction;
    exponents are not recognised) and then `,` or `}`, is dropped. Text inside
    quoted strings is copied unchanged, so a properly quoted `"confidence"`
    key is never touched.

    Separator handling keeps the surrounding members well-formed:
    - whitespace and one comma immediately before the stray field are removed;
    - the delimiter after the field is kept, except when the field was the
      first member after `{` or `[`, where the following comma is dropped as
      well so that no dangling `{,` is left behind.

    Args:
        json_text: Candidate JSON text, typically raw model output.

    Returns:
        The text with matching stray fields removed; unchanged if none match.
    """
    out_chars: List[str] = []
    n = len(json_text)
    i = 0
    in_str = False
    esc = False

    def _pop_trailing_ws_and_optional_comma() -> None:
        # Drop whitespace already emitted before the stray field.
        while out_chars and out_chars[-1].isspace():
            out_chars.pop()
        # Drop the comma that introduced the stray field, plus any
        # whitespace that preceded that comma.
        if out_chars and out_chars[-1] == ",":
            out_chars.pop()
            while out_chars and out_chars[-1].isspace():
                out_chars.pop()

    while i < n:
        ch = json_text[i]

        if in_str:
            # Inside a string everything is copied verbatim; we only track
            # escapes so that an escaped quote does not end the string.
            out_chars.append(ch)
            if esc:
                esc = False
            elif ch == "\\":
                esc = True
            elif ch == '"':
                in_str = False
            i += 1
            continue

        if ch == '"':
            in_str = True
            out_chars.append(ch)
            i += 1
            continue

        # Detect an unquoted `confidence: <number>` outside strings.
        # Only remove it if the number is followed by `,` or `}`.
        if json_text.startswith("confidence", i):
            j = i + len("confidence")
            while j < n and json_text[j].isspace():
                j += 1
            if j < n and json_text[j] == ":":
                j += 1
                while j < n and json_text[j].isspace():
                    j += 1

                # Parse a simple number: [+-] digits [. digits]
                if j < n and json_text[j] in "+-":
                    j += 1
                has_digit = False
                while j < n and json_text[j].isdigit():
                    has_digit = True
                    j += 1
                if j < n and json_text[j] == ".":
                    j += 1
                    while j < n and json_text[j].isdigit():
                        has_digit = True
                        j += 1

                if has_digit:
                    k = j
                    while k < n and json_text[k].isspace():
                        k += 1
                    if k < n and json_text[k] in {",", "}"}:
                        _pop_trailing_ws_and_optional_comma()
                        if (
                            json_text[k] == ","
                            and out_chars
                            and out_chars[-1] in ("{", "[")
                        ):
                            # The stray field was the first member, so there
                            # was no preceding comma to drop. Consume the
                            # following comma instead, otherwise the result
                            # would start with an invalid `{,`.
                            k += 1
                        i = k  # resume at the (kept) delimiter or just past it
                        continue

        out_chars.append(ch)
        i += 1

    return "".join(out_chars)
147
+
148
+
149
def extract_json_obj_robust(s: str) -> Optional[Dict[str, Any]]:
    """
    Best-effort extraction of a JSON object from a model output string.

    Steps:
    1. Trim the input with `stop_at_complete_json` (defined elsewhere in this
       module); if that returns a falsy value, the raw input is used instead.
    2. Take the span from the first `{` to the last `}` of that text.
    3. Strip Markdown code-fence markers ("```json" and "```").
    4. Remove stray unquoted `confidence: <num>` fields via
       `_remove_unquoted_confidence_field`.
    5. Parse the result with `json.loads`.

    Returns the parsed value on success, or None when no `{...}` span exists
    or parsing fails for any reason.
    """
    text = stop_at_complete_json(s) or s

    first_brace = text.find("{")
    last_brace = text.rfind("}")
    # A missing `}` gives last_brace == -1, which is always <= first_brace
    # once first_brace is known to be a valid index.
    if first_brace == -1 or last_brace <= first_brace:
        return None

    candidate = text[first_brace : last_brace + 1].strip()
    for fence in ("```json", "```"):
        candidate = candidate.replace(fence, "")
    candidate = _remove_unquoted_confidence_field(candidate.strip())

    try:
        parsed = json.loads(candidate)
    except Exception:
        # Deliberately broad: any parsing failure means "no usable object".
        return None
    return parsed
171
+
172
+
173
  # ----------------------------
174
  # Post-processing: remove template sentence
175
  # ----------------------------