calculus-agent / backend /tests /test_planner_bug.py
Đỗ Hải Nam
feat(backend): core multi-agent orchestration and API
ba5110e
import json
import re
# Sample planner response reproduced verbatim from the user's bug report.
# It is held in a raw string so the LaTeX backslashes reach `json.loads`
# exactly as the LLM emitted them, i.e. NOT escaped for JSON: the model wrote
# "\frac..." where valid JSON requires "\\frac...".
llm_output = r"""
{
"questions": [
{
"id": 1,
"content": "Tính tích phân $\iint\limits_{D} \frac{x^2 + 2}{x^2 + y^2 + 4} \, dxdy$, với $D$ là miền giới hạn bởi hình vuông $|x| + |y| = 1$.",
"type": "code",
"tool_input": "Viết code Python để tính tích phân kép của hàm f(x,y) = (x^2 + 2)/(x^2 + y^2 + 4) trên miền D là hình vuông |x| + |y| = 1"
},
{
"id": 2,
"content": "Tính tích phân $\iint\limits_{D} \frac{y^2 + 8}{x^2 + y^2 + 16} \, dxdy$, với $D$ là miền giới hạn bởi hình vuông $|x| + |y| = 2$.",
"type": "code",
"tool_input": "Viết code Python để tính tích phân kép của hàm f(x,y) = (y^2 + 8)/(x^2 + y^2 + 16) trên miền D là hình vuông |x| + |y| = 2"
}
]
}
"""

# Step 1: feed the untouched payload to the JSON parser and report the outcome.
print("--- Testing Raw JSON Load ---")
try:
    data = json.loads(llm_output)
except json.JSONDecodeError as e:
    print(f"❌ JSON Load Failed: {e}")
else:
    print("✅ JSON Load Success")

# Step 2: try to repair the payload with a regex before parsing it again.
print("\n--- Testing Regex Fix Strategy ---")
# Background on JSON string escapes:
#   * The only legal backslash sequences are \", \\, \/, \b, \f, \n, \r, \t
#     and \uXXXX.
#   * "\frac" is therefore syntactically valid JSON, but it parses as a form
#     feed followed by "rac" rather than the LaTeX command.
#   * "\iint" starts with \i, which is not a legal escape, so the parser
#     rejects the document.
# Strategy: find the backslashes that are not part of an escape we want to
# keep and double them.
# One alternation, tried left to right at every backslash:
#   1. `\\`, `\"` or `\uXXXX` -> an escape we keep exactly as written
#   2. any other single `\`   -> a stray (LaTeX) backslash that must be doubled
# Consuming `\\` as a unit is what stops an already-escaped backslash from
# being escaped a second time.
_JSON_BACKSLASH_RE = re.compile(r'\\(?:[\\"]|u[0-9a-fA-F]{4})|\\')


def fix_json_latex(text: str) -> str:
    r"""Repair JSON text whose string values contain unescaped LaTeX backslashes.

    LLMs frequently emit LaTeX such as ``\iint`` or ``\frac`` inside JSON
    strings without escaping the backslash, which is either invalid JSON
    (``\i``) or silently mis-parsed (``\f`` becomes a form feed). This
    function doubles such backslashes so the text decodes to the literal
    LaTeX source.

    Rules applied to every backslash in ``text``:

    * ``\\`` (an already-escaped backslash) is kept, so ``\\frac`` still
      decodes to ``\frac`` and a value ending in ``\\"`` still closes.
    * ``\"`` (escaped quote) is kept so string boundaries are not disturbed.
    * ``\uXXXX`` with exactly four hex digits is kept as a unicode escape;
      ``\u`` followed by anything else (e.g. ``\underline``) is doubled.
    * Every other backslash is doubled. This deliberately includes the legal
      JSON escapes ``\n``, ``\t``, ``\r``, ``\b``, ``\f`` and ``\/``: they
      decode to a literal backslash plus the letter, which is right for
      ``\text``/``\frac``/``\nabla`` but means a genuine ``\n`` newline escape
      decodes to the two characters ``\`` and ``n``.

    Known ambiguity: the function cannot tell a JSON-escaped backslash from a
    raw LaTeX line break ``\\``; both are treated as an escaped backslash.

    Args:
        text: Raw JSON text, typically straight from an LLM response.

    Returns:
        The text with stray backslashes doubled. The result is not validated;
        callers still need to handle ``json.JSONDecodeError``.
    """

    def _repair(match):
        token = match.group(0)
        # A multi-character match is one of the escapes we preserve; a lone
        # backslash is replaced by two literal backslashes.
        return token if len(token) > 1 else "\\\\"

    return _JSON_BACKSLASH_RE.sub(_repair, text)
# Run the repair on the sample payload and show a short preview of the result.
print(f"Original len: {len(llm_output)}")
fixed = fix_json_latex(llm_output)
print(f"Fixed start: {fixed[:100]}...")

# Parse the repaired text; only the parse itself is guarded, so a payload with
# an unexpected shape still surfaces as an uncaught KeyError/IndexError.
try:
    data = json.loads(fixed)
except json.JSONDecodeError as e:
    print(f"❌ Repair Failed: {e}")
else:
    print("✅ Repair Success!")
    print(f"Question 1 Content: {data['questions'][0]['content'][:50]}...")