Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -222,21 +222,36 @@ Your task is to process OCR-extracted text from images of Scratch 3.0 code block
|
|
| 222 |
|
| 223 |
"""
|
| 224 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 225 |
SYSTEM_PROMPT_JSON_CORRECTOR = """
|
| 226 |
-
You are
|
| 227 |
-
|
| 228 |
-
|
| 229 |
-
|
| 230 |
-
|
| 231 |
-
|
| 232 |
-
|
| 233 |
-
|
| 234 |
-
|
| 235 |
-
|
| 236 |
-
|
| 237 |
-
|
| 238 |
-
|
| 239 |
-
Remember: Your output must be valid JSON only, ready to be parsed without errors.
|
| 240 |
"""
|
| 241 |
|
| 242 |
# debugger and resolver agent for Scratch 3.0
|
|
@@ -602,69 +617,101 @@ stack_opcodes_functionalities = "\n".join([
|
|
| 602 |
ALL_SCRATCH_BLOCKS_CATALOG = _load_block_catalog(BLOCK_CATALOG_PATH)
|
| 603 |
|
| 604 |
# Helper function to extract JSON from LLM response
|
| 605 |
-
def extract_json_from_llm_response(raw_response: str) -> dict:
|
| 606 |
-
|
| 607 |
-
|
| 608 |
-
|
| 609 |
-
|
| 610 |
-
|
| 611 |
-
|
| 612 |
-
|
| 613 |
-
|
| 614 |
-
|
| 615 |
-
|
| 616 |
-
|
| 617 |
-
|
| 618 |
-
|
| 619 |
-
|
| 620 |
-
|
| 621 |
-
|
| 622 |
-
|
| 623 |
-
|
| 624 |
-
|
| 625 |
-
|
| 626 |
-
|
| 627 |
-
|
| 628 |
|
| 629 |
-
|
| 630 |
-
|
| 631 |
-
|
| 632 |
-
|
| 633 |
-
|
| 634 |
-
|
| 635 |
-
|
| 636 |
-
|
| 637 |
-
|
| 638 |
-
|
| 639 |
-
|
| 640 |
-
|
| 641 |
-
|
| 642 |
-
|
| 643 |
-
|
| 644 |
-
|
| 645 |
-
|
| 646 |
-
|
| 647 |
-
|
| 648 |
-
|
| 649 |
-
|
| 650 |
-
|
| 651 |
-
|
| 652 |
-
|
| 653 |
-
|
| 654 |
-
|
| 655 |
-
|
| 656 |
-
|
| 657 |
-
|
| 658 |
-
|
| 659 |
-
|
| 660 |
-
|
| 661 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 662 |
|
| 663 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 664 |
try:
|
| 665 |
-
|
| 666 |
-
|
| 667 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 668 |
raise
|
| 669 |
|
| 670 |
def reduce_image_size_to_limit(clean_b64_str: str, max_kb: int = 4000) -> str:
|
|
@@ -1096,18 +1143,29 @@ If you find any "Code-Blocks" then,
|
|
| 1096 |
|
| 1097 |
except json.JSONDecodeError as error_json:
|
| 1098 |
# If JSON parsing fails, use the json resolver agent
|
| 1099 |
-
correction_prompt = (
|
| 1100 |
-
|
| 1101 |
-
|
| 1102 |
-
|
| 1103 |
-
|
| 1104 |
-
|
| 1105 |
-
|
| 1106 |
-
|
| 1107 |
-
|
| 1108 |
-
|
| 1109 |
-
|
| 1110 |
-
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1111 |
try:
|
| 1112 |
correction_response = agent_json_resolver.invoke({"messages": [{"role": "user", "content": correction_prompt}]})
|
| 1113 |
corrected_output = extract_json_from_llm_response(correction_response['messages'][-1].content)
|
|
|
|
| 222 |
|
| 223 |
"""
|
| 224 |
|
| 225 |
+
# SYSTEM_PROMPT_JSON_CORRECTOR = """
|
| 226 |
+
# You are an assistant that outputs JSON responses strictly following the given schema.
|
| 227 |
+
# If the JSON you produce has any formatting errors, missing required fields, or invalid structure, you must identify the problems and correct them.
|
| 228 |
+
# Always return only valid JSON that fully conforms to the schema below, enclosed in triple backticks (```), without any extra text or explanation.
|
| 229 |
+
|
| 230 |
+
# If you receive an invalid or incomplete JSON response, fix it by:
|
| 231 |
+
# - Adding any missing required fields with appropriate values.
|
| 232 |
+
# - Correcting syntax errors such as missing commas, brackets, or quotes.
|
| 233 |
+
# - Ensuring the JSON structure matches the schema exactly.
|
| 234 |
+
# - Ensuring `"pseudocode"` is always a **single JSON string with embedded `\n` newlines** (never arrays, never concatenated with `+`).
|
| 235 |
+
# - Removing any invalid concatenation artifacts (`+`, `"string1" + "string2"`).
|
| 236 |
+
# - Never output explanations, comments, or extra text — only the corrected JSON.
|
| 237 |
+
# - **Every nested control structure (forever, repeat, if, if-else, etc.) must also have its own `end` placed at the correct depth, ensuring proper closure of each block. The placement of `end` is critical for differentiating script meaning (e.g., Case 1 vs Case 2 nesting).**
|
| 238 |
+
|
| 239 |
+
# Remember: Your output must be valid JSON only, ready to be parsed without errors.
|
| 240 |
+
# """
|
| 241 |
SYSTEM_PROMPT_JSON_CORRECTOR = """
|
| 242 |
+
You are a JSON correction assistant. Your ONLY task is to fix malformed JSON and return it in the correct format.
|
| 243 |
+
REQUIRED OUTPUT FORMAT:
|
| 244 |
+
{
|
| 245 |
+
"refined_logic": {
|
| 246 |
+
"name_variable": "sprite_name_here",
|
| 247 |
+
"pseudocode": "pseudocode_string_here"
|
| 248 |
+
}
|
| 249 |
+
}
|
| 250 |
+
RULES:
|
| 251 |
+
1. Extract the sprite name and pseudocode from the input
|
| 252 |
+
2. Return ONLY valid JSON in the exact format above
|
| 253 |
+
3. No explanations, no extra text, no other fields
|
| 254 |
+
4. If you can't find the data, use "Unknown" for name_variable and "No pseudocode found" for pseudocode
|
|
|
|
| 255 |
"""
|
| 256 |
|
| 257 |
# debugger and resolver agent for Scratch 3.0
|
|
|
|
| 617 |
ALL_SCRATCH_BLOCKS_CATALOG = _load_block_catalog(BLOCK_CATALOG_PATH)
|
| 618 |
|
| 619 |
# Helper function to extract JSON from LLM response
|
| 620 |
+
# def extract_json_from_llm_response(raw_response: str) -> dict:
|
| 621 |
+
# # --- 1) Pull out the JSON code‑block if present ---
|
| 622 |
+
# md = re.search(r"```(?:json)?\s*([\s\S]*?)\s*```", raw_response)
|
| 623 |
+
# json_string = md.group(1).strip() if md else raw_response
|
| 624 |
+
|
| 625 |
+
# # --- 2) Trim to the outermost { … } so we drop any prefix/suffix junk ---
|
| 626 |
+
# first, last = json_string.find('{'), json_string.rfind('}')
|
| 627 |
+
# if 0 <= first < last:
|
| 628 |
+
# json_string = json_string[first:last+1]
|
| 629 |
+
|
| 630 |
+
# # --- 3) PRE‑CLEANUP: remove stray assistant{…}, rogue assistant keys, fix boolean quotes ---
|
| 631 |
+
# json_string = re.sub(r'\b\w+\s*{', '{', json_string)
|
| 632 |
+
# json_string = re.sub(r'"assistant"\s*:', '', json_string)
|
| 633 |
+
# json_string = re.sub(r'\b(false|true)"', r'\1', json_string)
|
| 634 |
+
# logger.debug("Ran pre‑cleanup for stray tokens and boolean quotes.")
|
| 635 |
+
|
| 636 |
+
# # --- 3.1) Fix stray inner quotes at start of name/list values ---
|
| 637 |
+
# # e.g., { "name": " \"recent_scoress\"", ... } → "recent_scoress"
|
| 638 |
+
# json_string = re.sub(
|
| 639 |
+
# r'("name"\s*:\s*")\s*"',
|
| 640 |
+
# r'\1',
|
| 641 |
+
# json_string
|
| 642 |
+
# )
|
| 643 |
|
| 644 |
+
# # --- 4) Escape all embedded quotes in any `logic` value up to the next key ---
|
| 645 |
+
# def _esc(m):
|
| 646 |
+
# prefix, body = m.group(1), m.group(2)
|
| 647 |
+
# return prefix + body.replace('"', r'\"')
|
| 648 |
+
# json_string = re.sub(
|
| 649 |
+
# r'("logic"\s*:\s*")([\s\S]+?)(?=",\s*"[A-Za-z_]\w*"\s*:\s*)',
|
| 650 |
+
# _esc,
|
| 651 |
+
# json_string
|
| 652 |
+
# )
|
| 653 |
+
# logger.debug("Escaped embedded quotes in logic fields.")
|
| 654 |
+
|
| 655 |
+
# logger.debug("Quoted unquoted keys.")
|
| 656 |
+
|
| 657 |
+
# # --- 6) Remove trailing commas before } or ] ---
|
| 658 |
+
# json_string = re.sub(r',\s*(?=[}\],])', '', json_string)
|
| 659 |
+
# json_string = re.sub(r',\s*,', ',', json_string)
|
| 660 |
+
# logger.debug("Removed trailing commas.")
|
| 661 |
+
|
| 662 |
+
# # --- 7) Balance braces: drop extra } at end if needed ---
|
| 663 |
+
# ob, cb = json_string.count('{'), json_string.count('}')
|
| 664 |
+
# if cb > ob:
|
| 665 |
+
# excess = cb - ob
|
| 666 |
+
# json_string = json_string.rstrip()[:-excess]
|
| 667 |
+
# logger.debug(f"Stripped {excess} extra closing brace(s).")
|
| 668 |
+
|
| 669 |
+
# # --- 8) Escape literal newlines in *all* string values ---
|
| 670 |
+
# json_string = re.sub(
|
| 671 |
+
# r'"((?:[^"\\]|\\.)*?)"',
|
| 672 |
+
# lambda m: '"' + m.group(1).replace('\n', '\\n').replace('\r', '\\r') + '"',
|
| 673 |
+
# json_string,
|
| 674 |
+
# flags=re.DOTALL
|
| 675 |
+
# )
|
| 676 |
+
# logger.debug("Escaped newlines in strings.")
|
| 677 |
+
|
| 678 |
+
# # --- 9) Final parse attempt ---
|
| 679 |
+
# try:
|
| 680 |
+
# return json.loads(json_string)
|
| 681 |
+
# except json.JSONDecodeError:
|
| 682 |
+
# logger.error("Sanitized JSON still invalid:\n%s", json_string)
|
| 683 |
+
# raise
|
| 684 |
+
def extract_json_from_llm_response(raw_response: str) -> dict:
    """Extract and parse the first JSON object found in a raw LLM response.

    Two strategies are tried in order:

    1. A fenced markdown block (with or without a ``json`` tag) whose
       content is a JSON object.
    2. The JSON object that starts at the first ``{`` of the response.
       Any text after that object's closing brace is ignored, so trailing
       commentary (even if it contains braces) does not break parsing.

    Args:
        raw_response: Raw text returned by the LLM.

    Returns:
        The parsed JSON object.

    Raises:
        json.JSONDecodeError: If the response contains no ``{`` or the text
            starting at the first ``{`` is not a valid JSON object. The
            error is propagated so the caller can react to the failure.
    """
    logger.debug("Attempting to extract JSON from raw LLM response...")

    # 1. Look for a JSON markdown block first.
    match = re.search(r"```(?:json)?\s*({[\s\S]*?})\s*```", raw_response)
    if match:
        logger.debug("Found JSON inside a markdown block.")
        try:
            return json.loads(match.group(1))
        except json.JSONDecodeError as e:
            # A malformed block is not fatal: fall through to the brace scan.
            logger.warning("Failed to parse JSON from markdown block: %s", e)

    # 2. No usable block: decode the object anchored at the first brace.
    logger.debug("Markdown block not found or failed. Searching for outermost braces.")
    first_brace = raw_response.find("{")
    if first_brace == -1:
        logger.error("Could not find a valid JSON structure (outermost braces).")
        raise json.JSONDecodeError("No valid JSON object found in the response.", raw_response, 0)

    try:
        # raw_decode stops at the end of the first complete JSON value, so
        # text after the object (including stray '}' characters) is ignored
        # instead of being fed to the parser as in a first-'{'/last-'}' slice.
        parsed, _end = json.JSONDecoder().raw_decode(raw_response, first_brace)
    except json.JSONDecodeError as e:
        logger.error("Final JSON parsing attempt failed: %s", e)
        # Re-raise so the calling logic can handle the parsing failure.
        raise
    return parsed
|
| 716 |
|
| 717 |
def reduce_image_size_to_limit(clean_b64_str: str, max_kb: int = 4000) -> str:
|
|
|
|
| 1143 |
|
| 1144 |
except json.JSONDecodeError as error_json:
|
| 1145 |
# If JSON parsing fails, use the json resolver agent
|
| 1146 |
+
# correction_prompt = (
|
| 1147 |
+
# "Your task is to correct the provided JSON string to ensure it is **syntactically perfect and adheres strictly to JSON rules**.\n"
|
| 1148 |
+
# "It must be a JSON object with `refined_logic` (string) and `block_relationships` (array of objects).\n"
|
| 1149 |
+
# f"- **Error Details**: {error_json}\n\n"
|
| 1150 |
+
# "**Strict Instructions for your response:**\n"
|
| 1151 |
+
# "1. **ONLY** output the corrected JSON. Do not include any other text or explanations.\n"
|
| 1152 |
+
# "2. Ensure all keys and string values are enclosed in **double quotes**. Escape internal quotes (`\\`).\n"
|
| 1153 |
+
# "3. No trailing commas. Correct nesting.\n\n"
|
| 1154 |
+
# "Here is the problematic JSON string to correct:\n"
|
| 1155 |
+
# f"```json\n{llm_output_raw}\n```\n"
|
| 1156 |
+
# "Corrected JSON:\n"
|
| 1157 |
+
# )
|
| 1158 |
+
correction_prompt = f"""
|
| 1159 |
+
Fix this malformed response and return only the corrected JSON:
|
| 1160 |
+
Input: {llm_output_raw if 'llm_output_raw' in locals() else 'No response available'}
|
| 1161 |
+
Extract the sprite name and pseudocode, then return in this exact format:
|
| 1162 |
+
{{
|
| 1163 |
+
"refined_logic": {{
|
| 1164 |
+
"name_variable": "sprite_name",
|
| 1165 |
+
"pseudocode": "pseudocode_here"
|
| 1166 |
+
}}
|
| 1167 |
+
}}
|
| 1168 |
+
"""
|
| 1169 |
try:
|
| 1170 |
correction_response = agent_json_resolver.invoke({"messages": [{"role": "user", "content": correction_prompt}]})
|
| 1171 |
corrected_output = extract_json_from_llm_response(correction_response['messages'][-1].content)
|