Spaces:

baeGil
/

calculus-agent

Sleeping

calculus-agent / backend /tests /test_planner_bug.py

Đỗ Hải Nam

feat(backend): core multi-agent orchestration and API

ba5110e about 2 months ago

5.34 kB

	import json
	import re

	# Sample planner output, reproduced as the user reported it (it stands in for
	# raw LLM output). The raw string literal keeps every backslash exactly as the
	# model emitted it: the LaTeX is written as "\frac..." instead of the
	# JSON-correct "\\frac...", so the payload is not valid JSON.
	llm_output = r"""
	{
	"questions": [
	{
	"id": 1,
	"content": "Tính tích phân $\iint\limits_{D} \frac{x^2 + 2}{x^2 + y^2 + 4} \, dxdy$, với $D$ là miền giới hạn bởi hình vuông $\|x\| + \|y\| = 1$.",
	"type": "code",
	"tool_input": "Viết code Python để tính tích phân kép của hàm f(x,y) = (x^2 + 2)/(x^2 + y^2 + 4) trên miền D là hình vuông \|x\| + \|y\| = 1"
	},
	{
	"id": 2,
	"content": "Tính tích phân $\iint\limits_{D} \frac{y^2 + 8}{x^2 + y^2 + 16} \, dxdy$, với $D$ là miền giới hạn bởi hình vuông $\|x\| + \|y\| = 2$.",
	"type": "code",
	"tool_input": "Viết code Python để tính tích phân kép của hàm f(x,y) = (y^2 + 8)/(x^2 + y^2 + 16) trên miền D là hình vuông \|x\| + \|y\| = 2"
	}
	]
	}
	"""

	# Step 1: feed the untouched payload to the JSON parser and report the outcome.
	print("--- Testing Raw JSON Load ---")
	try:
	    data = json.loads(llm_output)
	except json.JSONDecodeError as err:
	    print(f"❌ JSON Load Failed: {err}")
	else:
	    print("✅ JSON Load Success")

	# Step 2: the repair strategy defined below.
	print("\n--- Testing Regex Fix Strategy ---")
	# JSON defines only these backslash escapes: \", \\, \/, \b, \f, \n, \r, \t
	# and \uXXXX. A LaTeX command therefore falls into one of two cases:
	#   * it starts with a letter that is a JSON escape (the \f of \frac), which
	#     parses but decodes to a control character (form feed) instead of LaTeX;
	#   * it starts with any other character (the \i of \iint), which is an
	#     invalid escape and makes the parser reject the document.


	def fix_json_latex(text: str) -> str:
	    r"""Repair JSON text whose string values contain unescaped LaTeX backslashes.

	    Every backslash in ``text`` is doubled, so ``json.loads`` decodes it as a
	    literal backslash. Two cases are left untouched:

	    * a backslash followed by a double quote (``\"``), because doubling it
	      would turn the escaped quote into a string terminator;
	    * a backslash that starts a complete unicode escape (``\u`` followed by
	      four hex digits).

	    Consequences of this heuristic:

	    * ``\frac``, ``\iint``, ``\,`` and similar become ``\\frac``, ``\\iint``,
	      ``\\,`` and decode to the original LaTeX source.
	    * The JSON escapes ``\n``, ``\t``, ``\b``, ``\f``, ``\r`` and ``\/`` are
	      doubled as well, so they decode to a literal backslash plus letter
	      rather than a control character. This is deliberate: it keeps
	      ``\text`` or ``\nabla`` from decoding to a tab or a newline.
	    * An already escaped backslash (``\\``) becomes four backslashes and
	      decodes to two. The function is therefore meant for text that failed
	      to parse as-is, not for text that is already valid JSON.

	    Args:
	        text: Raw JSON text, e.g. LLM output, possibly containing LaTeX.

	    Returns:
	        A copy of ``text`` with the selected backslashes doubled.
	    """
	    # The alternation inside the negative lookahead must be a bare `|`.
	    # Writing `\|` would match a literal pipe character, so the lookahead
	    # would never fire for `\"` or `\uXXXX` and those would be doubled too.
	    # The replacement template r"\\\\" expands to two backslash characters.
	    return re.sub(r'\\(?!"|u[0-9a-fA-F]{4})', r"\\\\", text)

	# Run the repair on the sample payload and show its size and leading slice.
	print(f"Original len: {len(llm_output)}")
	fixed = fix_json_latex(llm_output)
	print(f"Fixed start: {fixed[:100]}...")

	# Parse the repaired text. Only a JSON syntax error counts as a failed
	# repair; any other exception propagates unchanged.
	try:
	    data = json.loads(fixed)
	except json.JSONDecodeError as err:
	    print(f"❌ Repair Failed: {err}")
	else:
	    print("✅ Repair Success!")
	    print(f"Question 1 Content: {data['questions'][0]['content'][:50]}...")