File size: 4,847 Bytes
441d880
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
import re
import json

def parse_expected_output_fields(expected_output_text):
    """
    Turn a "key: description" listing into a list of (key, description) tuples.

    The text is trimmed and walked line by line. Each line is cut at its
    first colon and both halves are whitespace-stripped; lines that contain
    no colon at all are ignored.
    """
    pairs = []
    for raw_line in expected_output_text.strip().splitlines():
        name, colon, detail = raw_line.partition(':')
        if colon:
            # Only lines that actually contained a colon yield a field.
            pairs.append((name.strip(), detail.strip()))
    return pairs

def extract_fields_from_expected_output(expected_output_text):
    """
    List the field names declared in expected_output_text.

    The text is parsed with parse_expected_output_fields; the descriptions
    are dropped and the keys are returned in the order they were parsed.
    """
    names = []
    for pair in parse_expected_output_fields(expected_output_text):
        names.append(pair[0])
    return names

def split_json_string(text):
    """
    Best of both worlds:
    - Splits text into 'thought' and 'JSON' parts
    - Scans for all possible { positions
    - Cleans unescaped newlines inside quotes
    - Strips junk between </think> and JSON if JSON exists
    - Preserves full text after </think> if no JSON
    """

    # Step 1: Split at </think> if exists
    if '</think>' in text:
        thought_part, possible_json_part = text.split('</think>', 1)
        thought_part = thought_part.strip()
        possible_json_part = possible_json_part.strip()
    else:
        thought_part = None
        possible_json_part = text.strip()

    # Step 2: Find all { positions
    brace_positions = [m.start() for m in re.finditer(r'{', possible_json_part)]

    # Clean function: fix newlines inside quoted strings
    def clean_json_formatting(text):
        def fix_inside_quotes(match):
            content = match.group(1)
            fixed = content.replace('\n', '\\n').replace('\r', '\\n')
            return f'"{fixed}"'
        return re.sub(r'"(.*?)"', fix_inside_quotes, text, flags=re.DOTALL)

    for pos in brace_positions:
        candidate = possible_json_part[pos:].strip()

        # Pre-clean
        candidate = clean_json_formatting(candidate)

        # Fix double braces if necessary
        if candidate.startswith("{{") and "}}" in candidate:
            candidate = candidate.replace("{{", "{", 1).replace("}}", "}", 1)

        # Must start with {" or {'
        if not re.match(r'^\{\s*["\']', candidate):
            continue  # not real JSON, skip

        try:
            json.loads(candidate)
            # βœ… Successful parse
            return thought_part, candidate
        except json.JSONDecodeError:
            continue  # try next

    # πŸ›  No valid JSON found β€” return thought and full original remainder (no chopping)
    return thought_part, possible_json_part

def extract_and_parse_json(result_text):
    """
    Extract and parse JSON output, handling cases where the JSON is enclosed
    in a triple-backtick fence (```json ... ```) or is already bare JSON.

    Two attempts are made, in order:
    1. Parse the whole text after replacing unescaped newlines with spaces.
    2. Locate a ```json fenced block containing an object and parse its
       contents, applying the same newline cleaning.

    Args:
        result_text (str): The raw text output containing JSON data.

    Returns:
        The parsed JSON value (typically a dict) if either attempt succeeds,
        None otherwise. Diagnostic messages are printed on failure.
    """
    if not result_text:
        print("🚨 No result text data received.")
        return None

    def clean_json_formatting(text):
        # Raw line breaks inside string values make json.loads fail; a
        # newline not preceded by a backslash is replaced with a space.
        return re.sub(r'(?<!\\)\n', ' ', text)

    # Attempt 1: the whole text is JSON once line breaks are cleaned.
    try:
        return json.loads(clean_json_formatting(result_text))
    except json.JSONDecodeError:
        print("Unable to parse cleaned direct JSON.")

    # Attempt 2: JSON object wrapped in a ```json fence. Whitespace around
    # the object is matched loosely so CRLF line endings, trailing spaces
    # and an empty object ({}) are all accepted.
    match = re.search(r'```json\s*(\{[\s\S]*?\})\s*```', result_text)
    if match:
        try:
            # Apply the same cleaning as the direct path so that fenced JSON
            # with raw newlines inside strings is not rejected.
            return json.loads(clean_json_formatting(match.group(1)))
        except json.JSONDecodeError:
            pass  # fall through to the "not found" report below

    print("🚨 No valid JSON found.")
    return None  # No valid JSON detected

def generate_json_expected_output(expected_output_text):
    """
    Build a prompt instruction that shows the required JSON answer layout.

    The fields are read from expected_output_text with
    parse_expected_output_fields. Each key is lower-cased and emitted as an
    indented `"key": description` line inside a brace-delimited template,
    preceded by a fixed instruction sentence.
    """
    entries = [
        f'    "{name.lower()}": {detail}'
        for name, detail in parse_expected_output_fields(expected_output_text)
    ]

    if entries:
        *leading, final = entries
        # Every entry but the last ends with a comma; the last one has any
        # trailing commas stripped.
        entries = [f"{entry}," for entry in leading] + [final.rstrip(',')]

    body = "\n".join(entries)

    instruction = (
        "You must return your answer strictly in the following JSON format. "
        "Do not include any markdown, commentary, or extra text. The response must be valid JSON:\n\n"
    )
    return instruction + "{\n" + f"{body}\n" + "}"