import json import re # The string exactly as the user reported (simulating LLM output) # Note: In Python string literal, I need to represent what the LLM likely outputted. # If LLM outputted: "content": "\frac..." # That is invalid JSON. It should be "\\frac..." llm_output = r""" { "questions": [ { "id": 1, "content": "Tính tích phân $\iint\limits_{D} \frac{x^2 + 2}{x^2 + y^2 + 4} \, dxdy$, với $D$ là miền giới hạn bởi hình vuông $|x| + |y| = 1$.", "type": "code", "tool_input": "Viết code Python để tính tích phân kép của hàm f(x,y) = (x^2 + 2)/(x^2 + y^2 + 4) trên miền D là hình vuông |x| + |y| = 1" }, { "id": 2, "content": "Tính tích phân $\iint\limits_{D} \frac{y^2 + 8}{x^2 + y^2 + 16} \, dxdy$, với $D$ là miền giới hạn bởi hình vuông $|x| + |y| = 2$.", "type": "code", "tool_input": "Viết code Python để tính tích phân kép của hàm f(x,y) = (y^2 + 8)/(x^2 + y^2 + 16) trên miền D là hình vuông |x| + |y| = 2" } ] } """ print("--- Testing Raw JSON Load ---") try: data = json.loads(llm_output) print("✅ JSON Load Success") except json.JSONDecodeError as e: print(f"❌ JSON Load Failed: {e}") print("\n--- Testing Regex Fix Strategy ---") # Strategy: Look for backslashes that are NOT followed by specific JSON control chars # But in JSON, only \", \\, \/, \b, \f, \n, \r, \t, \uXXXX contain backslashes. # LaTeX backslashes like \f in \frac are form feeds? No, \f is form feed. # \i in \iint is invalid. def fix_json_latex(text): """ Repair JSON string containing unescaped LaTeX backslashes. Example: "\frac" -> "\\frac" """ # Pattern: Match a backslash that is NOT followed by valid JSON escape chars # Valid escapes: " \ / b f n r t u # Note: \u needs 4 hex digits. # Negative lookahead is useful here. # We want to match \ where next char is NOT one of " \ / b f n r t u # But wait, \f is Form Feed in JSON. In LaTeX it is \frac. # If LLM outputs "\frac", Python sees `\f` (form feed) + `rac`? # No, we get the raw string from LLM. # LLM outputting literal "\frac" means backslash + f + r + a + c. # In JSON string "\frac", the parser sees `\f` (escape for form feed) + `rac`. Valid syntax? Yes. # But "\iint": `\i` is Invalid escape. # So the problem is mainly mostly invalid escapes like \i, \l, \s, \x, etc. # AND valid escapes that are actually LaTeX (like \t -> tab, but meant \text). # HEURISTIC: Double ALL backslashes, then un-double the valid JSON control ones? # No, that's messy. # Better: Match `\` that is followed by something looking like a LaTeX command (alpha chars). # But technically `\n` is Newline. # Robust Strategy used in other projects: # 1. Replace `\\` with `ROOT_BACKSLASH_PLACEHOLDER` # 2. Replace `\` with `\\` IF it's not a valid escape? # Let's try simple regex: escape ALL backslashes first? # LLM usually sends plain text. # If we do `text.replace("\\", "\\\\")`, then `\n` becomes `\\n` (literal \n). # `json.loads` will read it as literally backslash+n. # This might be SAFER for content fields! # But we have structure: `{"questions": ...}`. We don't want to break `\"` for quotes. # Correct Regex: Match `\` that is NOT followed by `"` (quote). # Because we assume structure uses quotes. # But what about `\n` inside the content? # If LLM meant newline, it sends `\n`. If we escape it to `\\n`, we get literal \n. # If LLM meant LaTeX `\frac`, it sends `\f...`. If we escape to `\\f...`, we get literal \f... (which is what we want for LaTeX source). # So escaping `\` -> `\\` is generally safe EXCEPT for: # 1. `\"` (which closes the string) -> We MUST keep `\"` as `\"` (escaped quote). # 2. `\\` (literal backslash) -> We probably want to keep it or double it? # Proposal: # Replace `\` with `\\` UNLESS it is followed by `"` new_text = re.sub(r'\\(?!"|u[0-9a-fA-F]{4})', r'\\\\', text) # Exclude unicode \uXXXX too # Also need to NOT double existing double backslashes? # Text: `\\frac` -> regex sees backslash, not followed by quote -> `\\\\frac`. # `json.loads` sees `\\` -> literal backslash. `frac` -> literal frac. Result: `\frac`. Correct. # Text: `\frac` -> regex sees backslash -> `\\frac`. # `json.loads` sees `\` (invalid?) -> No, `\\` becomes `\`. `frac`. Result: `\frac`. # Wait, `json.loads("\\frac")` -> in python string `\\frac`. Parser see `\` then `f`. `\f` is valid escape? # No, `\\` in JSON string means "Literal Backslash". # So `{"a": "\\frac"}` -> python dict `{'a': '\\frac'}`. # The Regex `r'\\(?!"|u[0-9a-fA-F]{4})'` matches any backslash NOT followed by quote or unicode. # Replacement: `\\\\` (double backslash string, usually means 2 chars `\` `\`). return new_text print(f"Original len: {len(llm_output)}") fixed = fix_json_latex(llm_output) print(f"Fixed start: {fixed[:100]}...") try: data = json.loads(fixed) print("✅ Repair Success!") print(f"Question 1 Content: {data['questions'][0]['content'][:50]}...") except json.JSONDecodeError as e: print(f"❌ Repair Failed: {e}")