Spaces:
Build error
Build error
File size: 4,847 Bytes
441d880 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 | import re
import json
def parse_expected_output_fields(expected_output_text):
    """
    Break a "key: description" listing into (key, description) tuples.

    The text is stripped and processed line by line. Each line is cut at
    its first colon and both halves are whitespace-trimmed; lines that
    contain no colon are ignored.
    """
    # partition(':') splits at the first colon only, so any further colons
    # stay inside the description.
    pieces = (
        row.partition(':')
        for row in expected_output_text.strip().splitlines()
        if ':' in row
    )
    return [(name.strip(), detail.strip()) for name, _, detail in pieces]
def extract_fields_from_expected_output(expected_output_text):
    """
    List only the field names (keys) found in expected_output_text.

    Parsing is delegated to parse_expected_output_fields; the description
    half of every pair is discarded and the original order is kept.
    """
    names = []
    for name, _description in parse_expected_output_fields(expected_output_text):
        names.append(name)
    return names
def split_json_string(text):
"""
Best of both worlds:
- Splits text into 'thought' and 'JSON' parts
- Scans for all possible { positions
- Cleans unescaped newlines inside quotes
- Strips junk between </think> and JSON if JSON exists
- Preserves full text after </think> if no JSON
"""
# Step 1: Split at </think> if exists
if '</think>' in text:
thought_part, possible_json_part = text.split('</think>', 1)
thought_part = thought_part.strip()
possible_json_part = possible_json_part.strip()
else:
thought_part = None
possible_json_part = text.strip()
# Step 2: Find all { positions
brace_positions = [m.start() for m in re.finditer(r'{', possible_json_part)]
# Clean function: fix newlines inside quoted strings
def clean_json_formatting(text):
def fix_inside_quotes(match):
content = match.group(1)
fixed = content.replace('\n', '\\n').replace('\r', '\\n')
return f'"{fixed}"'
return re.sub(r'"(.*?)"', fix_inside_quotes, text, flags=re.DOTALL)
for pos in brace_positions:
candidate = possible_json_part[pos:].strip()
# Pre-clean
candidate = clean_json_formatting(candidate)
# Fix double braces if necessary
if candidate.startswith("{{") and "}}" in candidate:
candidate = candidate.replace("{{", "{", 1).replace("}}", "}", 1)
# Must start with {" or {'
if not re.match(r'^\{\s*["\']', candidate):
continue # not real JSON, skip
try:
json.loads(candidate)
# β
Successful parse
return thought_part, candidate
except json.JSONDecodeError:
continue # try next
# π No valid JSON found β return thought and full original remainder (no chopping)
return thought_part, possible_json_part
def extract_and_parse_json(result_text):
    """
    Extract and parse JSON from raw text.

    Two strategies are tried in order:
    1. Parse the whole text directly, after replacing unescaped newlines
       with spaces.
    2. Parse the contents of each triple-backtick fenced block (with or
       without a ``json`` tag) whose body starts with '{', applying the
       same newline cleanup, and return the first one that parses.

    Args:
        result_text (str): The raw text output containing JSON data.

    Returns:
        The parsed JSON value (a dict for fenced blocks; whatever
        json.loads yields for the direct path), or None when the input is
        empty or nothing parses.
    """
    if not result_text:
        print("π¨ No result text data received.")
        return None

    def clean_json_formatting(text):
        # Replace newlines not preceded by a backslash with a space, so raw
        # line breaks inside string values do not make json.loads fail.
        return re.sub(r'(?<!\\)\n', ' ', text)

    # Strategy 1: the text as a whole is JSON.
    try:
        return json.loads(clean_json_formatting(result_text))
    except json.JSONDecodeError:
        print("Unable to parse cleaned direct JSON.")

    # Strategy 2: JSON inside a fenced block. The tag is optional and
    # case-insensitive, and any whitespace (including CRLF) may sit between
    # the fences and the braces.
    fence_pattern = re.compile(
        r'```(?:json)?\s*(\{[\s\S]+?\})\s*```', re.IGNORECASE
    )
    for match in fence_pattern.finditer(result_text):
        try:
            # Apply the same cleanup as the direct path so fenced JSON with
            # raw newlines inside strings is accepted as well.
            return json.loads(clean_json_formatting(match.group(1).strip()))
        except json.JSONDecodeError:
            continue  # this block is not valid JSON; try the next fence

    print("π¨ No valid JSON found.")
    return None  # No valid JSON detected
def generate_json_expected_output(expected_output_text):
    """
    Build a prompt instruction describing the required JSON answer format.

    The fields come from parse_expected_output_fields. Each key is
    lower-cased and wrapped in double quotes; its description is inserted
    verbatim (unquoted) as the placeholder value. Entries are separated by
    commas, and trailing commas are stripped from the last entry.
    """
    entries = [
        f' "{name.lower()}": {detail},'
        for name, detail in parse_expected_output_fields(expected_output_text)
    ]

    # The final entry must not end with a comma.
    if entries:
        entries[-1] = entries[-1].rstrip(',')

    body = "\n".join(entries)
    header = (
        "You must return your answer strictly in the following JSON format. "
        "Do not include any markdown, commentary, or extra text. The response must be valid JSON:\n\n"
    )
    return header + "{\n" + f"{body}\n" + "}"
|